| |
| <!DOCTYPE html> |
| <!--[if lt IE 7]> <html class="no-js lt-ie9 lt-ie8 lt-ie7"> <![endif]--> |
| <!--[if IE 7]> <html class="no-js lt-ie9 lt-ie8"> <![endif]--> |
| <!--[if IE 8]> <html class="no-js lt-ie9"> <![endif]--> |
| <!--[if gt IE 8]><!--> <html class="no-js"> <!--<![endif]--> |
| <head> |
| <meta charset="utf-8"> |
| <meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1"> |
| <title>NULL Semantics - Spark 3.0.0-preview2 Documentation</title> |
| |
| |
| |
| |
| <link rel="stylesheet" href="css/bootstrap.min.css"> |
| <style> |
| body { |
| padding-top: 60px; |
| padding-bottom: 40px; |
| } |
| </style> |
| <meta name="viewport" content="width=device-width"> |
| <link rel="stylesheet" href="css/bootstrap-responsive.min.css"> |
| <link rel="stylesheet" href="css/main.css"> |
| |
| <script src="js/vendor/modernizr-2.6.1-respond-1.1.0.min.js"></script> |
| |
| <link rel="stylesheet" href="css/pygments-default.css"> |
| |
| |
| <!-- Google analytics script --> |
| <script type="text/javascript"> |
| var _gaq = _gaq || []; |
| _gaq.push(['_setAccount', 'UA-32518208-2']); |
| _gaq.push(['_trackPageview']); |
| |
| (function() { |
| var ga = document.createElement('script'); ga.type = 'text/javascript'; ga.async = true; |
| ga.src = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js'; |
| var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s); |
| })(); |
| </script> |
| |
| |
| </head> |
| <body> |
| <!--[if lt IE 7]> |
| <p class="chromeframe">You are using an outdated browser. <a href="https://browsehappy.com/">Upgrade your browser today</a> or <a href="http://www.google.com/chromeframe/?redirect=true">install Google Chrome Frame</a> to better experience this site.</p> |
| <![endif]--> |
| |
| <!-- This code is taken from http://twitter.github.com/bootstrap/examples/hero.html --> |
| |
| <div class="navbar navbar-fixed-top" id="topbar"> |
| <div class="navbar-inner"> |
| <div class="container"> |
| <div class="brand"><a href="index.html"> |
| <img src="img/spark-logo-hd.png" style="height:50px;"/></a><span class="version">3.0.0-preview2</span> |
| </div> |
| <ul class="nav"> |
| <!--TODO(andyk): Add class="active" attribute to li some how.--> |
| <li><a href="index.html">Overview</a></li> |
| |
| <li class="dropdown"> |
| <a href="#" class="dropdown-toggle" data-toggle="dropdown">Programming Guides<b class="caret"></b></a> |
| <ul class="dropdown-menu"> |
| <li><a href="quick-start.html">Quick Start</a></li> |
| <li><a href="rdd-programming-guide.html">RDDs, Accumulators, Broadcasts Vars</a></li> |
| <li><a href="sql-programming-guide.html">SQL, DataFrames, and Datasets</a></li> |
| <li><a href="structured-streaming-programming-guide.html">Structured Streaming</a></li> |
| <li><a href="streaming-programming-guide.html">Spark Streaming (DStreams)</a></li> |
| <li><a href="ml-guide.html">MLlib (Machine Learning)</a></li> |
| <li><a href="graphx-programming-guide.html">GraphX (Graph Processing)</a></li> |
| <li><a href="sparkr.html">SparkR (R on Spark)</a></li> |
| </ul> |
| </li> |
| |
| <li class="dropdown"> |
| <a href="#" class="dropdown-toggle" data-toggle="dropdown">API Docs<b class="caret"></b></a> |
| <ul class="dropdown-menu"> |
| <li><a href="api/scala/index.html#org.apache.spark.package">Scala</a></li> |
| <li><a href="api/java/index.html">Java</a></li> |
| <li><a href="api/python/index.html">Python</a></li> |
| <li><a href="api/R/index.html">R</a></li> |
| <li><a href="api/sql/index.html">SQL, Built-in Functions</a></li> |
| </ul> |
| </li> |
| |
| <li class="dropdown"> |
| <a href="#" class="dropdown-toggle" data-toggle="dropdown">Deploying<b class="caret"></b></a> |
| <ul class="dropdown-menu"> |
| <li><a href="cluster-overview.html">Overview</a></li> |
| <li><a href="submitting-applications.html">Submitting Applications</a></li> |
| <li class="divider"></li> |
| <li><a href="spark-standalone.html">Spark Standalone</a></li> |
| <li><a href="running-on-mesos.html">Mesos</a></li> |
| <li><a href="running-on-yarn.html">YARN</a></li> |
| <li><a href="running-on-kubernetes.html">Kubernetes</a></li> |
| </ul> |
| </li> |
| |
| <li class="dropdown"> |
| <a href="api.html" class="dropdown-toggle" data-toggle="dropdown">More<b class="caret"></b></a> |
| <ul class="dropdown-menu"> |
| <li><a href="configuration.html">Configuration</a></li> |
| <li><a href="monitoring.html">Monitoring</a></li> |
| <li><a href="tuning.html">Tuning Guide</a></li> |
| <li><a href="job-scheduling.html">Job Scheduling</a></li> |
| <li><a href="security.html">Security</a></li> |
| <li><a href="hardware-provisioning.html">Hardware Provisioning</a></li> |
| <li><a href="migration-guide.html">Migration Guide</a></li> |
| <li class="divider"></li> |
| <li><a href="building-spark.html">Building Spark</a></li> |
| <li><a href="https://spark.apache.org/contributing.html">Contributing to Spark</a></li> |
| <li><a href="https://spark.apache.org/third-party-projects.html">Third Party Projects</a></li> |
| </ul> |
| </li> |
| </ul> |
| <!--<p class="navbar-text pull-right"><span class="version-text">v3.0.0-preview2</span></p>--> |
| </div> |
| </div> |
| </div> |
| |
| <div class="container-wrapper"> |
| |
| |
| |
| <div class="left-menu-wrapper"> |
| <div class="left-menu"> |
| <h3><a href="sql-programming-guide.html">Spark SQL Guide</a></h3> |
| |
| <ul> |
| |
| <li> |
| <a href="sql-getting-started.html"> |
| |
| Getting Started |
| |
| </a> |
| </li> |
| |
| |
| |
| <li> |
| <a href="sql-data-sources.html"> |
| |
| Data Sources |
| |
| </a> |
| </li> |
| |
| |
| |
| <li> |
| <a href="sql-performance-tuning.html"> |
| |
| Performance Tuning |
| |
| </a> |
| </li> |
| |
| |
| |
| <li> |
| <a href="sql-distributed-sql-engine.html"> |
| |
| Distributed SQL Engine |
| |
| </a> |
| </li> |
| |
| |
| |
| <li> |
| <a href="sql-pyspark-pandas-with-arrow.html"> |
| |
| PySpark Usage Guide for Pandas with Apache Arrow |
| |
| </a> |
| </li> |
| |
| |
| |
| <li> |
| <a href="sql-migration-old.html"> |
| |
| Migration Guide |
| |
| </a> |
| </li> |
| |
| |
| |
| <li> |
| <a href="sql-ref.html"> |
| |
| SQL Reference |
| |
| </a> |
| </li> |
| |
| |
| |
| <ul> |
| |
| <li> |
| <a href="sql-ref-datatypes.html"> |
| |
| Data Types |
| |
| </a> |
| </li> |
| |
| |
| |
| <li> |
| <a href="sql-ref-null-semantics.html"> |
| |
| <b>Null Semantics</b> |
| |
| </a> |
| </li> |
| |
| |
| |
| <li> |
| <a href="sql-ref-nan-semantics.html"> |
| |
| NaN Semantics |
| |
| </a> |
| </li> |
| |
| |
| |
| <li> |
| <a href="sql-ref-syntax.html"> |
| |
| SQL Syntax |
| |
| </a> |
| </li> |
| |
| |
| |
| <li> |
| <a href="sql-ref-functions.html"> |
| |
| Functions |
| |
| </a> |
| </li> |
| |
| |
| |
| <li> |
| <a href="sql-ref-arithmetic-ops.html"> |
| |
| Arthmetic operations |
| |
| </a> |
| </li> |
| |
| |
| |
| </ul> |
| |
| |
| |
| </ul> |
| |
| </div> |
| </div> |
| |
| <input id="nav-trigger" class="nav-trigger" checked type="checkbox"> |
| <label for="nav-trigger"></label> |
| <div class="content-with-sidebar" id="content"> |
| |
| <h1 class="title">NULL Semantics</h1> |
| |
| |
| <h3 id="description">Description</h3> |
| <p>A table consists of a set of rows and each row contains a set of columns. |
| A column is associated with a data type and represents |
| a specific attribute of an entity (for example, <code class="highlighter-rouge">age</code> is a column of an |
| entity called <code class="highlighter-rouge">person</code>). Sometimes, the value of a column |
| specific to a row is not known at the time the row comes into existence. |
| In <code class="highlighter-rouge">SQL</code>, such values are represnted as <code class="highlighter-rouge">NULL</code>. This section details the |
| semantics of <code class="highlighter-rouge">NULL</code> values handling in various operators, expressions and |
| other <code class="highlighter-rouge">SQL</code> constructs.</p> |
| |
| <ol> |
| <li><a href="#comp-operators">Null handling in comparison operators</a></li> |
| <li><a href="#logical-operators">Null handling in Logical operators</a></li> |
| <li><a href="#expressions">Null handling in Expressions</a> |
| <ol> |
| <li><a href="#null-in-tolerant">Null handling in null-in-tolerant expressions</a></li> |
| <li><a href="#can-process-null">Null handling Expressions that can process null value operands</a></li> |
| <li><a href="#built-in-aggregate">Null handling in built-in aggregate expressions</a></li> |
| </ol> |
| </li> |
| <li><a href="#condition-expressions">Null handling in WHERE, HAVING and JOIN conditions</a></li> |
| <li><a href="#aggregate-operator">Null handling in GROUP BY and DISTINCT</a></li> |
| <li><a href="#order-by">Null handling in ORDER BY</a></li> |
| <li><a href="#set-operators">Null handling in UNION, INTERSECT, EXCEPT</a></li> |
| <li><a href="#exists-not-exists">Null handling in EXISTS and NOT EXISTS subquery</a></li> |
| <li><a href="#in-not-in">Null handling in IN and NOT IN subquery</a></li> |
| </ol> |
| |
| <style type="text/css"> |
| .tsclass {font-size:12px;color:#333333;width:40%;border-width: 2px;border-color: #729ea5;border-collapse: collapse;} |
| .tsclass th {text-align: left;} |
| </style> |
| |
| <p>The following illustrates the schema layout and data of a table named <code class="highlighter-rouge">person</code>. The data contains <code class="highlighter-rouge">NULL</code> values in |
| the <code class="highlighter-rouge">age</code> column and this table will be used in various examples in the sections below. |
| <strong><u>TABLE: person</u></strong></p> |
| <table class="tsclass" border="1"> |
| <tr><th>Id</th><th>Name</th><th>Age</th></tr> |
| <tr><td>100</td><td>Joe</td><td>30</td></tr> |
| <tr><td>200</td><td>Marry</td><td>NULL</td></tr> |
| <tr><td>300</td><td>Mike</td><td>18</td></tr> |
| <tr><td>400</td><td>Fred</td><td>50</td></tr> |
| <tr><td>500</td><td>Albert</td><td>NULL</td></tr> |
| <tr><td>600</td><td>Michelle</td><td>30</td></tr> |
| <tr><td>700</td><td>Dan</td><td>50</td></tr> |
| </table> |
| |
| <h3 id="comparision-operators-">Comparision operators <a name="comp-operators"></a></h3> |
| |
| <p>Apache spark supports the standard comparison operators such as ‘>’, ‘>=’, ‘=’, ‘<’ and ‘<=’. |
| The result of these operators is unknown or <code class="highlighter-rouge">NULL</code> when one of the operarands or both the operands are |
| unknown or <code class="highlighter-rouge">NULL</code>. In order to compare the <code class="highlighter-rouge">NULL</code> values for equality, Spark provides a null-safe |
| equal operator (‘<=>’), which returns <code class="highlighter-rouge">False</code> when one of the operand is <code class="highlighter-rouge">NULL</code> and returns ‘True<code class="highlighter-rouge"> when |
| both the operands are </code>NULL<code class="highlighter-rouge">. The following table illustrates the behaviour of comparison operators when |
| one or both operands are </code>NULL`:</p> |
| |
| <table class="tsclass" border="1"> |
| <tr> |
| <th>Left Operand</th> |
| <th>Right Operand</th> |
| <th>></th> |
| <th>>=</th> |
| <th>=</th> |
| <th><</th> |
| <th><=</th> |
| <th><=></th> |
| </tr> |
| <tr> |
| <td>NULL</td> |
| <td>Any value</td> |
| <td>NULL</td> |
| <td>NULL</td> |
| <td>NULL</td> |
| <td>NULL</td> |
| <td>NULL</td> |
| <td>False</td> |
| </tr> |
| <tr> |
| <td>Any value</td> |
| <td>NULL</td> |
| <td>NULL</td> |
| <td>NULL</td> |
| <td>NULL</td> |
| <td>NULL</td> |
| <td>NULL</td> |
| <td>False</td> |
| </tr> |
| <tr> |
| <td>NULL</td> |
| <td>NULL</td> |
| <td>NULL</td> |
| <td>NULL</td> |
| <td>NULL</td> |
| <td>NULL</td> |
| <td>NULL</td> |
| <td>True</td> |
| </tr> |
| </table> |
| |
| <h3 id="examples">Examples</h3> |
| |
| <figure class="highlight"><pre><code class="language-sql" data-lang="sql"><span class="c1">-- Normal comparison operators return `NULL` when one of the operand is `NULL`.</span> |
| <span class="k">SELECT</span> <span class="mi">5</span> <span class="o">></span> <span class="k">null</span> <span class="k">AS</span> <span class="n">expression_output</span><span class="p">;</span> |
| <span class="o">+</span><span class="c1">-----------------+</span> |
| <span class="o">|</span><span class="n">expression_output</span><span class="o">|</span> |
| <span class="o">+</span><span class="c1">-----------------+</span> |
| <span class="o">|</span><span class="k">null</span> <span class="o">|</span> |
| <span class="o">+</span><span class="c1">-----------------+</span> |
| |
| <span class="c1">-- Normal comparison operators return `NULL` when both the operands are `NULL`.</span> |
| <span class="k">SELECT</span> <span class="k">null</span> <span class="o">=</span> <span class="k">null</span> <span class="k">AS</span> <span class="n">expression_output</span><span class="p">;</span> |
| <span class="o">+</span><span class="c1">-----------------+</span> |
| <span class="o">|</span><span class="n">expression_output</span><span class="o">|</span> |
| <span class="o">+</span><span class="c1">-----------------+</span> |
| <span class="o">|</span><span class="k">null</span> <span class="o">|</span> |
| <span class="o">+</span><span class="c1">-----------------+</span> |
| |
| <span class="c1">-- Null-safe equal operator return `False` when one of the operand is `NULL`</span> |
| <span class="k">SELECT</span> <span class="mi">5</span> <span class="o"><=></span> <span class="k">null</span> <span class="k">AS</span> <span class="n">expression_output</span><span class="p">;</span> |
| <span class="o">+</span><span class="c1">-----------------+</span> |
| <span class="o">|</span><span class="n">expression_output</span><span class="o">|</span> |
| <span class="o">+</span><span class="c1">-----------------+</span> |
| <span class="o">|</span><span class="k">false</span> <span class="o">|</span> |
| <span class="o">+</span><span class="c1">-----------------+</span> |
| |
| <span class="c1">-- Null-safe equal operator return `True` when one of the operand is `NULL`</span> |
| <span class="k">SELECT</span> <span class="k">NULL</span> <span class="o"><=></span> <span class="k">NULL</span><span class="p">;</span> |
| <span class="o">+</span><span class="c1">-----------------+</span> |
| <span class="o">|</span><span class="n">expression_output</span><span class="o">|</span> |
| <span class="o">+</span><span class="c1">-----------------+</span> |
| <span class="o">|</span><span class="k">true</span> <span class="o">|</span> |
| <span class="o">+</span><span class="c1">-----------------+</span></code></pre></figure> |
| |
| <h3 id="logical-operators-">Logical operators <a name="logical-operators"></a></h3> |
| <p>Spark supports standard logical operators such as <code class="highlighter-rouge">AND</code>, <code class="highlighter-rouge">OR</code> and <code class="highlighter-rouge">NOT</code>. These operators take <code class="highlighter-rouge">Boolean</code> expressions |
| as the arguments and return a <code class="highlighter-rouge">Boolean</code> value.</p> |
| |
| <p>The following tables illustrate the behavior of logical opeators when one or both operands are <code class="highlighter-rouge">NULL</code>.</p> |
| |
| <table class="tsclass" border="1"> |
| <tr> |
| <th>Left Operand</th> |
| <th>Right Operand</th> |
| <th>OR</th> |
| <th>AND</th> |
| </tr> |
| <tr> |
| <td>True</td> |
| <td>NULL</td> |
| <td>True</td> |
| <td>NULL</td> |
| </tr> |
| <tr> |
| <td>False</td> |
| <td>NULL</td> |
| <td>NULL</td> |
| <td>False</td> |
| </tr> |
| <tr> |
| <td>NULL</td> |
| <td>True</td> |
| <td>True</td> |
| <td>NULL</td> |
| </tr> |
| <tr> |
| <td>NULL</td> |
| <td>False</td> |
| <td>NULL</td> |
| <td>NULL</td> |
| </tr> |
| <tr> |
| <td>NULL</td> |
| <td>NULL</td> |
| <td>NULL</td> |
| <td>NULL</td> |
| </tr> |
| </table> |
| <p><br /></p> |
| <table class="tsclass" border="1"> |
| <tr> |
| <th>operand</th> |
| <th>NOT</th> |
| </tr> |
| <tr> |
| <td>NULL</td> |
| <td>NULL</td> |
| </tr> |
| </table> |
| |
| <h3 id="examples-1">Examples</h3> |
| |
| <figure class="highlight"><pre><code class="language-sql" data-lang="sql"><span class="c1">-- Normal comparison operators return `NULL` when one of the operands is `NULL`.</span> |
| <span class="k">SELECT</span> <span class="p">(</span><span class="k">true</span> <span class="k">OR</span> <span class="k">null</span><span class="p">)</span> <span class="k">AS</span> <span class="n">expression_output</span><span class="p">;</span> |
| <span class="o">+</span><span class="c1">-----------------+</span> |
| <span class="o">|</span><span class="n">expression_output</span><span class="o">|</span> |
| <span class="o">+</span><span class="c1">-----------------+</span> |
| <span class="o">|</span><span class="k">true</span> <span class="o">|</span> |
| <span class="o">+</span><span class="c1">-----------------+</span> |
| |
| <span class="c1">-- Normal comparison operators return `NULL` when both the operands are `NULL`.</span> |
| <span class="k">SELECT</span> <span class="p">(</span><span class="k">null</span> <span class="k">OR</span> <span class="k">false</span><span class="p">)</span> <span class="k">AS</span> <span class="n">expression_output</span> |
| <span class="o">+</span><span class="c1">-----------------+</span> |
| <span class="o">|</span><span class="n">expression_output</span><span class="o">|</span> |
| <span class="o">+</span><span class="c1">-----------------+</span> |
| <span class="o">|</span><span class="k">null</span> <span class="o">|</span> |
| <span class="o">+</span><span class="c1">-----------------+</span> |
| |
| <span class="c1">-- Null-safe equal operator returns `False` when one of the operands is `NULL`</span> |
| <span class="k">SELECT</span> <span class="k">NOT</span><span class="p">(</span><span class="k">null</span><span class="p">)</span> <span class="k">AS</span> <span class="n">expression_output</span><span class="p">;</span> |
| <span class="o">+</span><span class="c1">-----------------+</span> |
| <span class="o">|</span><span class="n">expression_output</span><span class="o">|</span> |
| <span class="o">+</span><span class="c1">-----------------+</span> |
| <span class="o">|</span><span class="k">null</span> <span class="o">|</span> |
| <span class="o">+</span><span class="c1">-----------------+</span></code></pre></figure> |
| |
| <h3 id="expressions-">Expressions <a name="expressions"></a></h3> |
| <p>The comparison operators and logical operators are treated as expressions in |
| Spark. Other than these two kinds of expressions, Spark supports other form of |
| expressions such as function expressions, cast expressions, etc. The expressions |
| in Spark can be broadly classified as :</p> |
| <ul> |
| <li>Null in-tolerent expressions</li> |
| <li>Expressions that can process <code class="highlighter-rouge">NULL</code> value operands |
| <ul> |
| <li>The result of these expressions depends on the expression itself.</li> |
| </ul> |
| </li> |
| </ul> |
| |
| <h4 id="null-in-tolerant-expressions-">Null in-tolerant expressions <a name="null-in-tolerant"></a></h4> |
| <p>Null in-tolerant expressions return <code class="highlighter-rouge">NULL</code> when one or more arguments of |
| expression are <code class="highlighter-rouge">NULL</code> and most of the expressions fall in this category.</p> |
| |
| <h5 id="examples-2">Examples</h5> |
| |
| <figure class="highlight"><pre><code class="language-sql" data-lang="sql"><span class="k">SELECT</span> <span class="n">concat</span><span class="p">(</span><span class="s1">'John'</span><span class="p">,</span> <span class="k">null</span><span class="p">)</span> <span class="k">as</span> <span class="n">expression_output</span><span class="p">;</span> |
| <span class="o">+</span><span class="c1">-----------------+</span> |
| <span class="o">|</span><span class="n">expression_output</span><span class="o">|</span> |
| <span class="o">+</span><span class="c1">-----------------+</span> |
| <span class="o">|</span><span class="k">null</span> <span class="o">|</span> |
| <span class="o">+</span><span class="c1">-----------------+</span> |
| |
| <span class="k">SELECT</span> <span class="n">positive</span><span class="p">(</span><span class="k">null</span><span class="p">)</span> <span class="k">as</span> <span class="n">expression_output</span><span class="p">;</span> |
| <span class="o">+</span><span class="c1">-----------------+</span> |
| <span class="o">|</span><span class="n">expression_output</span><span class="o">|</span> |
| <span class="o">+</span><span class="c1">-----------------+</span> |
| <span class="o">|</span><span class="k">null</span> <span class="o">|</span> |
| <span class="o">+</span><span class="c1">-----------------+</span> |
| |
| <span class="k">SELECT</span> <span class="n">to_date</span><span class="p">(</span><span class="k">null</span><span class="p">)</span> <span class="k">as</span> <span class="n">expression_output</span><span class="p">;</span> |
| <span class="o">+</span><span class="c1">-----------------+</span> |
| <span class="o">|</span><span class="n">expression_output</span><span class="o">|</span> |
| <span class="o">+</span><span class="c1">-----------------+</span> |
| <span class="o">|</span><span class="k">null</span> <span class="o">|</span> |
| <span class="o">+</span><span class="c1">-----------------+</span></code></pre></figure> |
| |
| <h4 id="expressions-that-can-process-null-value-operands-">Expressions that can process null value operands. <a name="can-process-null"></a></h4> |
| |
| <p>This class of expressions are designed to handle <code class="highlighter-rouge">NULL</code> values. The result of the |
| expressions depends on the expression itself. As an example, function expression <code class="highlighter-rouge">isnull</code> |
| returns a <code class="highlighter-rouge">true</code> on null input and <code class="highlighter-rouge">false</code> on non null input where as function <code class="highlighter-rouge">coalesce</code> |
| returns the first non <code class="highlighter-rouge">NULL</code> value in its list of operands. However, <code class="highlighter-rouge">coalesce</code> returns |
| <code class="highlighter-rouge">NULL</code> when all its operands are <code class="highlighter-rouge">NULL</code>. Below is an incomplete list of expressions of this category.</p> |
| <ul> |
| <li>COALESCE</li> |
| <li>NULLIF</li> |
| <li>IFNULL</li> |
| <li>NVL</li> |
| <li>NVL2</li> |
| <li>ISNAN</li> |
| <li>NANVL</li> |
| <li>ISNULL</li> |
| <li>ISNOTNULL</li> |
| <li>ATLEASTNNONNULLS</li> |
| <li>IN</li> |
| </ul> |
| |
| <h5 id="examples-3">Examples</h5> |
| |
| <figure class="highlight"><pre><code class="language-sql" data-lang="sql"><span class="k">SELECT</span> <span class="k">isnull</span><span class="p">(</span><span class="k">null</span><span class="p">)</span> <span class="k">AS</span> <span class="n">expression_output</span><span class="p">;</span> |
| <span class="o">+</span><span class="c1">-----------------+</span> |
| <span class="o">|</span><span class="n">expression_output</span><span class="o">|</span> |
| <span class="o">+</span><span class="c1">-----------------+</span> |
| <span class="o">|</span><span class="k">true</span> <span class="o">|</span> |
| <span class="o">+</span><span class="c1">-----------------+</span> |
| |
| <span class="c1">-- Returns the first occurence of non `NULL` value.</span> |
| <span class="k">SELECT</span> <span class="n">coalesce</span><span class="p">(</span><span class="k">null</span><span class="p">,</span> <span class="k">null</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="k">null</span><span class="p">)</span> <span class="k">AS</span> <span class="n">expression_output</span><span class="p">;</span> |
| <span class="o">+</span><span class="c1">-----------------+</span> |
| <span class="o">|</span><span class="n">expression_output</span><span class="o">|</span> |
| <span class="o">+</span><span class="c1">-----------------+</span> |
| <span class="o">|</span><span class="mi">3</span> <span class="o">|</span> |
| <span class="o">+</span><span class="c1">-----------------+</span> |
| |
| <span class="c1">-- Returns `NULL` as all its operands are `NULL`. </span> |
| <span class="k">SELECT</span> <span class="n">coalesce</span><span class="p">(</span><span class="k">null</span><span class="p">,</span> <span class="k">null</span><span class="p">,</span> <span class="k">null</span><span class="p">,</span> <span class="k">null</span><span class="p">)</span> <span class="k">AS</span> <span class="n">expression_output</span><span class="p">;</span> |
| <span class="o">+</span><span class="c1">-----------------+</span> |
| <span class="o">|</span><span class="n">expression_output</span><span class="o">|</span> |
| <span class="o">+</span><span class="c1">-----------------+</span> |
| <span class="o">|</span><span class="k">null</span> <span class="o">|</span> |
| <span class="o">+</span><span class="c1">-----------------+</span> |
| |
| <span class="k">SELECT</span> <span class="n">isnan</span><span class="p">(</span><span class="k">null</span><span class="p">)</span> <span class="k">as</span> <span class="n">expression_output</span><span class="p">;</span> |
| <span class="o">+</span><span class="c1">-----------------+</span> |
| <span class="o">|</span><span class="n">expression_output</span><span class="o">|</span> |
| <span class="o">+</span><span class="c1">-----------------+</span> |
| <span class="o">|</span><span class="k">false</span> <span class="o">|</span> |
| <span class="o">+</span><span class="c1">-----------------+</span></code></pre></figure> |
| |
| <h4 id="builtin-aggregate-expressions-">Builtin Aggregate Expressions <a name="built-in-aggregate"></a></h4> |
| <p>Aggregate functions compute a single result by processing a set of input rows. Below are |
| the rules of how <code class="highlighter-rouge">NULL</code> values are handled by aggregate functions.</p> |
| <ul> |
| <li><code class="highlighter-rouge">NULL</code> values are ignored from processing by all the aggregate functions. |
| <ul> |
| <li>Only exception to this rule is COUNT(*) function.</li> |
| </ul> |
| </li> |
| <li>Some aggregate functions return <code class="highlighter-rouge">NULL</code> when all input values are <code class="highlighter-rouge">NULL</code> or the input data set |
| is empty.<br /> The list of these functions is: |
| <ul> |
| <li>MAX</li> |
| <li>MIN</li> |
| <li>SUM</li> |
| <li>AVG</li> |
| <li>EVERY</li> |
| <li>ANY</li> |
| <li>SOME</li> |
| </ul> |
| </li> |
| </ul> |
| |
| <h4 id="examples-4">Examples</h4> |
| |
| <figure class="highlight"><pre><code class="language-sql" data-lang="sql"><span class="c1">-- `count(*)` does not skip `NULL` values.</span> |
| <span class="k">SELECT</span> <span class="k">count</span><span class="p">(</span><span class="o">*</span><span class="p">)</span> <span class="k">FROM</span> <span class="n">person</span><span class="p">;</span> |
| <span class="o">+</span><span class="c1">--------+</span> |
| <span class="o">|</span><span class="k">count</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span><span class="o">|</span> |
| <span class="o">+</span><span class="c1">--------+</span> |
| <span class="o">|</span><span class="mi">7</span> <span class="o">|</span> |
| <span class="o">+</span><span class="c1">--------+</span> |
| |
| <span class="c1">-- `NULL` values in column `age` are skipped from processing.</span> |
| <span class="k">SELECT</span> <span class="k">count</span><span class="p">(</span><span class="n">age</span><span class="p">)</span> <span class="k">FROM</span> <span class="n">person</span><span class="p">;</span> |
| <span class="o">+</span><span class="c1">----------+</span> |
| <span class="o">|</span><span class="k">count</span><span class="p">(</span><span class="n">age</span><span class="p">)</span><span class="o">|</span> |
| <span class="o">+</span><span class="c1">----------+</span> |
| <span class="o">|</span><span class="mi">5</span> <span class="o">|</span> |
| <span class="o">+</span><span class="c1">----------+</span> |
| |
| <span class="c1">-- `count(*)` on an empty input set returns 0. This is unlike the other</span> |
| <span class="c1">-- aggregate functions, such as `max`, which return `NULL`.</span> |
| <span class="k">SELECT</span> <span class="k">count</span><span class="p">(</span><span class="o">*</span><span class="p">)</span> <span class="k">FROM</span> <span class="n">person</span> <span class="k">where</span> <span class="mi">1</span> <span class="o">=</span> <span class="mi">0</span><span class="p">;</span> |
| <span class="o">+</span><span class="c1">--------+</span> |
| <span class="o">|</span><span class="k">count</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span><span class="o">|</span> |
| <span class="o">+</span><span class="c1">--------+</span> |
| <span class="o">|</span><span class="mi">0</span> <span class="o">|</span> |
| <span class="o">+</span><span class="c1">--------+</span> |
| |
| <span class="c1">-- `NULL` values are excluded from computation of maximum value.</span> |
| <span class="k">SELECT</span> <span class="k">max</span><span class="p">(</span><span class="n">age</span><span class="p">)</span> <span class="k">FROM</span> <span class="n">person</span><span class="p">;</span> |
| <span class="o">+</span><span class="c1">--------+</span> |
| <span class="o">|</span><span class="k">max</span><span class="p">(</span><span class="n">age</span><span class="p">)</span><span class="o">|</span> |
| <span class="o">+</span><span class="c1">--------+</span> |
| <span class="o">|</span><span class="mi">50</span> <span class="o">|</span> |
| <span class="o">+</span><span class="c1">--------+</span> |
| |
| <span class="c1">-- `max` returns `NULL` on an empty input set.</span> |
| <span class="k">SELECT</span> <span class="k">max</span><span class="p">(</span><span class="n">age</span><span class="p">)</span> <span class="k">FROM</span> <span class="n">person</span> <span class="k">where</span> <span class="mi">1</span> <span class="o">=</span> <span class="mi">0</span><span class="p">;</span> |
| <span class="o">+</span><span class="c1">--------+</span> |
| <span class="o">|</span><span class="k">max</span><span class="p">(</span><span class="n">age</span><span class="p">)</span><span class="o">|</span> |
| <span class="o">+</span><span class="c1">--------+</span> |
| <span class="o">|</span><span class="k">null</span> <span class="o">|</span> |
| <span class="o">+</span><span class="c1">--------+</span></code></pre></figure> |
| |
| <h3 id="condition-expressions-in-where-having-and-join-clauses-">Condition expressions in WHERE, HAVING and JOIN clauses. <a name="condition-expressions"></a></h3> |
| <p><code class="highlighter-rouge">WHERE</code>, <code class="highlighter-rouge">HAVING</code> operators filter rows based on the user specified condition. |
| A <code class="highlighter-rouge">JOIN</code> operator is used to combine rows from two tables based on a join condition. |
| For all the three operators, a condition expression is a boolean expression and can return |
| <code>True, False or Unknown (NULL)</code>. They are “satisfied” if the result of the condition is <code class="highlighter-rouge">True</code>.</p> |
| |
| <h4 id="examples-5">Examples</h4> |
| |
| <figure class="highlight"><pre><code class="language-sql" data-lang="sql"><span class="c1">-- Persons whose age is unknown (`NULL`) are filtered out from the result set.</span> |
| <span class="k">SELECT</span> <span class="o">*</span> <span class="k">FROM</span> <span class="n">person</span> <span class="k">WHERE</span> <span class="n">age</span> <span class="o">></span> <span class="mi">0</span><span class="p">;</span> |
| <span class="o">+</span><span class="c1">--------+---+</span> |
| <span class="o">|</span><span class="n">name</span> <span class="o">|</span><span class="n">age</span><span class="o">|</span> |
| <span class="o">+</span><span class="c1">--------+---+</span> |
| <span class="o">|</span><span class="n">Michelle</span><span class="o">|</span><span class="mi">30</span> <span class="o">|</span> |
| <span class="o">|</span><span class="n">Fred</span> <span class="o">|</span><span class="mi">50</span> <span class="o">|</span> |
| <span class="o">|</span><span class="n">Mike</span> <span class="o">|</span><span class="mi">18</span> <span class="o">|</span> |
| <span class="o">|</span><span class="n">Dan</span> <span class="o">|</span><span class="mi">50</span> <span class="o">|</span> |
| <span class="o">|</span><span class="n">Joe</span> <span class="o">|</span><span class="mi">30</span> <span class="o">|</span> |
| <span class="o">+</span><span class="c1">--------+---+</span> |
| |
| <span class="c1">-- `IS NULL` expression is used in disjunction to select the persons</span> |
| <span class="c1">-- with unknown (`NULL`) records.</span> |
| <span class="k">SELECT</span> <span class="o">*</span> <span class="k">FROM</span> <span class="n">person</span> <span class="k">WHERE</span> <span class="n">age</span> <span class="o">></span> <span class="mi">0</span> <span class="k">OR</span> <span class="n">age</span> <span class="k">IS</span> <span class="k">NULL</span><span class="p">;</span> |
| <span class="o">+</span><span class="c1">--------+----+</span> |
| <span class="o">|</span><span class="n">name</span> <span class="o">|</span><span class="n">age</span> <span class="o">|</span> |
| <span class="o">+</span><span class="c1">--------+----+</span> |
| <span class="o">|</span><span class="n">Albert</span> <span class="o">|</span><span class="k">null</span><span class="o">|</span> |
| <span class="o">|</span><span class="n">Michelle</span><span class="o">|</span><span class="mi">30</span> <span class="o">|</span> |
| <span class="o">|</span><span class="n">Fred</span> <span class="o">|</span><span class="mi">50</span> <span class="o">|</span> |
| <span class="o">|</span><span class="n">Mike</span> <span class="o">|</span><span class="mi">18</span> <span class="o">|</span> |
| <span class="o">|</span><span class="n">Dan</span> <span class="o">|</span><span class="mi">50</span> <span class="o">|</span> |
| <span class="o">|</span><span class="n">Marry</span> <span class="o">|</span><span class="k">null</span><span class="o">|</span> |
| <span class="o">|</span><span class="n">Joe</span> <span class="o">|</span><span class="mi">30</span> <span class="o">|</span> |
| <span class="o">+</span><span class="c1">--------+----+</span> |
| |
| <span class="c1">-- Person with unknown(`NULL`) ages are skipped from processing.</span> |
| <span class="k">SELECT</span> <span class="o">*</span> <span class="k">FROM</span> <span class="n">person</span> <span class="k">GROUP</span> <span class="k">BY</span> <span class="n">age</span> <span class="k">HAVING</span> <span class="k">max</span><span class="p">(</span><span class="n">age</span><span class="p">)</span> <span class="o">></span> <span class="mi">18</span><span class="p">;</span> |
| <span class="o">+</span><span class="c1">---+--------+ </span> |
| <span class="o">|</span><span class="n">age</span><span class="o">|</span><span class="k">count</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span><span class="o">|</span> |
| <span class="o">+</span><span class="c1">---+--------+</span> |
| <span class="o">|</span><span class="mi">50</span> <span class="o">|</span><span class="mi">2</span> <span class="o">|</span> |
| <span class="o">|</span><span class="mi">30</span> <span class="o">|</span><span class="mi">2</span> <span class="o">|</span> |
| <span class="o">+</span><span class="c1">---+--------+</span> |
| |
| <span class="c1">-- A self join case with a join condition `p1.age = p2.age AND p1.name = p2.name`.</span> |
| <span class="c1">-- The persons with unknown age (`NULL`) are filtered out by the join operator.</span> |
| <span class="k">SELECT</span> <span class="o">*</span> <span class="k">FROM</span> <span class="n">person</span> <span class="n">p1</span><span class="p">,</span> <span class="n">person</span> <span class="n">p2</span> |
| <span class="k">WHERE</span> <span class="n">p1</span><span class="p">.</span><span class="n">age</span> <span class="o">=</span> <span class="n">p2</span><span class="p">.</span><span class="n">age</span> |
| <span class="k">AND</span> <span class="n">p1</span><span class="p">.</span><span class="n">name</span> <span class="o">=</span> <span class="n">p2</span><span class="p">.</span><span class="n">name</span><span class="p">;</span> |
| <span class="o">+</span><span class="c1">--------+---+--------+---+</span> |
| <span class="o">|</span><span class="n">name</span> <span class="o">|</span><span class="n">age</span><span class="o">|</span><span class="n">name</span> <span class="o">|</span><span class="n">age</span><span class="o">|</span> |
| <span class="o">+</span><span class="c1">--------+---+--------+---+</span> |
| <span class="o">|</span><span class="n">Michelle</span><span class="o">|</span><span class="mi">30</span> <span class="o">|</span><span class="n">Michelle</span><span class="o">|</span><span class="mi">30</span> <span class="o">|</span> |
| <span class="o">|</span><span class="n">Fred</span> <span class="o">|</span><span class="mi">50</span> <span class="o">|</span><span class="n">Fred</span> <span class="o">|</span><span class="mi">50</span> <span class="o">|</span> |
| <span class="o">|</span><span class="n">Mike</span> <span class="o">|</span><span class="mi">18</span> <span class="o">|</span><span class="n">Mike</span> <span class="o">|</span><span class="mi">18</span> <span class="o">|</span> |
| <span class="o">|</span><span class="n">Dan</span> <span class="o">|</span><span class="mi">50</span> <span class="o">|</span><span class="n">Dan</span> <span class="o">|</span><span class="mi">50</span> <span class="o">|</span> |
| <span class="o">|</span><span class="n">Joe</span> <span class="o">|</span><span class="mi">30</span> <span class="o">|</span><span class="n">Joe</span> <span class="o">|</span><span class="mi">30</span> <span class="o">|</span> |
| <span class="o">+</span><span class="c1">--------+---+--------+---+</span> |
| |
| <span class="c1">-- The age column from both legs of join are compared using null-safe equal which</span> |
| <span class="c1">-- is why the persons with unknown age (`NULL`) are qualified by the join.</span> |
| <span class="k">SELECT</span> <span class="o">*</span> <span class="k">FROM</span> <span class="n">person</span> <span class="n">p1</span><span class="p">,</span> <span class="n">person</span> <span class="n">p2</span> |
| <span class="k">WHERE</span> <span class="n">p1</span><span class="p">.</span><span class="n">age</span> <span class="o"><=></span> <span class="n">p2</span><span class="p">.</span><span class="n">age</span> |
| <span class="k">AND</span> <span class="n">p1</span><span class="p">.</span><span class="n">name</span> <span class="o">=</span> <span class="n">p2</span><span class="p">.</span><span class="n">name</span><span class="p">;</span> |
| <span class="o">+</span><span class="c1">--------+----+--------+----+</span> |
| <span class="o">|</span> <span class="n">name</span><span class="o">|</span> <span class="n">age</span><span class="o">|</span> <span class="n">name</span><span class="o">|</span> <span class="n">age</span><span class="o">|</span> |
| <span class="o">+</span><span class="c1">--------+----+--------+----+</span> |
| <span class="o">|</span> <span class="n">Albert</span><span class="o">|</span><span class="k">null</span><span class="o">|</span> <span class="n">Albert</span><span class="o">|</span><span class="k">null</span><span class="o">|</span> |
| <span class="o">|</span><span class="n">Michelle</span><span class="o">|</span> <span class="mi">30</span><span class="o">|</span><span class="n">Michelle</span><span class="o">|</span> <span class="mi">30</span><span class="o">|</span> |
| <span class="o">|</span> <span class="n">Fred</span><span class="o">|</span> <span class="mi">50</span><span class="o">|</span> <span class="n">Fred</span><span class="o">|</span> <span class="mi">50</span><span class="o">|</span> |
| <span class="o">|</span> <span class="n">Mike</span><span class="o">|</span> <span class="mi">18</span><span class="o">|</span> <span class="n">Mike</span><span class="o">|</span> <span class="mi">18</span><span class="o">|</span> |
| <span class="o">|</span> <span class="n">Dan</span><span class="o">|</span> <span class="mi">50</span><span class="o">|</span> <span class="n">Dan</span><span class="o">|</span> <span class="mi">50</span><span class="o">|</span> |
| <span class="o">|</span> <span class="n">Marry</span><span class="o">|</span><span class="k">null</span><span class="o">|</span> <span class="n">Marry</span><span class="o">|</span><span class="k">null</span><span class="o">|</span> |
| <span class="o">|</span> <span class="n">Joe</span><span class="o">|</span> <span class="mi">30</span><span class="o">|</span> <span class="n">Joe</span><span class="o">|</span> <span class="mi">30</span><span class="o">|</span> |
| <span class="o">+</span><span class="c1">--------+----+--------+----+</span></code></pre></figure> |
| |
| <h3 id="aggregate-operator-group-by-distinct-">Aggregate operator (GROUP BY, DISTINCT) <a name="aggregate-operator"></a></h3> |
| <p>As discussed in the previous section <a href="sql-ref-null-semantics.html#comparision-operators">comparison operator</a>, |
| two <code class="highlighter-rouge">NULL</code> values are not equal. However, for the purpose of grouping and distinct processing, the two or more |
| values with <code class="highlighter-rouge">NULL data</code>are grouped together into the same bucket. This behaviour is conformant with SQL |
| standard and with other enterprise database management systems.</p> |
| |
| <h4 id="examples-6">Examples</h4> |
| |
| <figure class="highlight"><pre><code class="language-sql" data-lang="sql"><span class="c1">-- `NULL` values are put in one bucket in `GROUP BY` processing.</span> |
| <span class="k">SELECT</span> <span class="n">age</span><span class="p">,</span> <span class="k">count</span><span class="p">(</span><span class="o">*</span><span class="p">)</span> <span class="k">FROM</span> <span class="n">person</span> <span class="k">GROUP</span> <span class="k">BY</span> <span class="n">age</span><span class="p">;</span> |
| <span class="o">+</span><span class="c1">----+--------+ </span> |
| <span class="o">|</span><span class="n">age</span> <span class="o">|</span><span class="k">count</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span><span class="o">|</span> |
| <span class="o">+</span><span class="c1">----+--------+</span> |
| <span class="o">|</span><span class="k">null</span><span class="o">|</span><span class="mi">2</span> <span class="o">|</span> |
| <span class="o">|</span><span class="mi">50</span> <span class="o">|</span><span class="mi">2</span> <span class="o">|</span> |
| <span class="o">|</span><span class="mi">30</span> <span class="o">|</span><span class="mi">2</span> <span class="o">|</span> |
| <span class="o">|</span><span class="mi">18</span> <span class="o">|</span><span class="mi">1</span> <span class="o">|</span> |
| <span class="o">+</span><span class="c1">----+--------+</span> |
| |
| <span class="c1">-- All `NULL` ages are considered one distinct value in `DISTINCT` processing.</span> |
| <span class="k">SELECT</span> <span class="k">DISTINCT</span> <span class="n">age</span> <span class="k">FROM</span> <span class="n">person</span><span class="p">;</span> |
| <span class="o">+</span><span class="c1">----+</span> |
| <span class="o">|</span><span class="n">age</span> <span class="o">|</span> |
| <span class="o">+</span><span class="c1">----+</span> |
| <span class="o">|</span><span class="k">null</span><span class="o">|</span> |
| <span class="o">|</span><span class="mi">50</span> <span class="o">|</span> |
| <span class="o">|</span><span class="mi">30</span> <span class="o">|</span> |
| <span class="o">|</span><span class="mi">18</span> <span class="o">|</span> |
| <span class="o">+</span><span class="c1">----+</span></code></pre></figure> |
| |
| <h3 id="sort-operator-order-by-clause-">Sort operator (ORDER BY Clause) <a name="order-by"></a></h3> |
| <p>Spark SQL supports null ordering specification in <code class="highlighter-rouge">ORDER BY</code> clause. Spark processes the <code class="highlighter-rouge">ORDER BY</code> clause by |
| placing all the <code class="highlighter-rouge">NULL</code> values at first or at last depending on the null ordering specification. By default, all |
| the <code class="highlighter-rouge">NULL</code> values are placed at first.</p> |
| |
| <h4 id="examples-7">Examples</h4> |
| |
| <figure class="highlight"><pre><code class="language-sql" data-lang="sql"><span class="c1">-- `NULL` values are shown at first and other values</span> |
| <span class="c1">-- are sorted in ascending way.</span> |
| <span class="k">SELECT</span> <span class="n">age</span><span class="p">,</span> <span class="n">name</span> <span class="k">FROM</span> <span class="n">person</span> <span class="k">ORDER</span> <span class="k">BY</span> <span class="n">age</span><span class="p">;</span> |
| <span class="o">+</span><span class="c1">----+--------+</span> |
| <span class="o">|</span><span class="n">age</span> <span class="o">|</span><span class="n">name</span> <span class="o">|</span> |
| <span class="o">+</span><span class="c1">----+--------+</span> |
| <span class="o">|</span><span class="k">null</span><span class="o">|</span><span class="n">Marry</span> <span class="o">|</span> |
| <span class="o">|</span><span class="k">null</span><span class="o">|</span><span class="n">Albert</span> <span class="o">|</span> |
| <span class="o">|</span><span class="mi">18</span> <span class="o">|</span><span class="n">Mike</span> <span class="o">|</span> |
| <span class="o">|</span><span class="mi">30</span> <span class="o">|</span><span class="n">Michelle</span><span class="o">|</span> |
| <span class="o">|</span><span class="mi">30</span> <span class="o">|</span><span class="n">Joe</span> <span class="o">|</span> |
| <span class="o">|</span><span class="mi">50</span> <span class="o">|</span><span class="n">Fred</span> <span class="o">|</span> |
| <span class="o">|</span><span class="mi">50</span> <span class="o">|</span><span class="n">Dan</span> <span class="o">|</span> |
| <span class="o">+</span><span class="c1">----+--------+</span> |
| |
| <span class="c1">-- Column values other than `NULL` are sorted in ascending</span> |
| <span class="c1">-- way and `NULL` values are shown at the last.</span> |
| <span class="k">SELECT</span> <span class="n">age</span><span class="p">,</span> <span class="n">name</span> <span class="k">FROM</span> <span class="n">person</span> <span class="k">ORDER</span> <span class="k">BY</span> <span class="n">age</span> <span class="n">NULLS</span> <span class="k">LAST</span><span class="p">;</span> |
| <span class="o">+</span><span class="c1">----+--------+</span> |
| <span class="o">|</span><span class="n">age</span> <span class="o">|</span><span class="n">name</span> <span class="o">|</span> |
| <span class="o">+</span><span class="c1">----+--------+</span> |
| <span class="o">|</span><span class="mi">18</span> <span class="o">|</span><span class="n">Mike</span> <span class="o">|</span> |
| <span class="o">|</span><span class="mi">30</span> <span class="o">|</span><span class="n">Michelle</span><span class="o">|</span> |
| <span class="o">|</span><span class="mi">30</span> <span class="o">|</span><span class="n">Joe</span> <span class="o">|</span> |
| <span class="o">|</span><span class="mi">50</span> <span class="o">|</span><span class="n">Dan</span> <span class="o">|</span> |
| <span class="o">|</span><span class="mi">50</span> <span class="o">|</span><span class="n">Fred</span> <span class="o">|</span> |
| <span class="o">|</span><span class="k">null</span><span class="o">|</span><span class="n">Marry</span> <span class="o">|</span> |
| <span class="o">|</span><span class="k">null</span><span class="o">|</span><span class="n">Albert</span> <span class="o">|</span> |
| <span class="o">+</span><span class="c1">----+--------+</span> |
| |
| <span class="c1">-- Columns other than `NULL` values are sorted in descending</span> |
| <span class="c1">-- and `NULL` values are shown at the last.</span> |
| <span class="k">SELECT</span> <span class="n">age</span><span class="p">,</span> <span class="n">name</span> <span class="k">FROM</span> <span class="n">person</span> <span class="k">ORDER</span> <span class="k">BY</span> <span class="n">age</span> <span class="k">DESC</span> <span class="n">NULLS</span> <span class="k">LAST</span><span class="p">;</span> |
| <span class="o">+</span><span class="c1">----+--------+</span> |
| <span class="o">|</span><span class="n">age</span> <span class="o">|</span><span class="n">name</span> <span class="o">|</span> |
| <span class="o">+</span><span class="c1">----+--------+</span> |
| <span class="o">|</span><span class="mi">50</span> <span class="o">|</span><span class="n">Fred</span> <span class="o">|</span> |
| <span class="o">|</span><span class="mi">50</span> <span class="o">|</span><span class="n">Dan</span> <span class="o">|</span> |
| <span class="o">|</span><span class="mi">30</span> <span class="o">|</span><span class="n">Michelle</span><span class="o">|</span> |
| <span class="o">|</span><span class="mi">30</span> <span class="o">|</span><span class="n">Joe</span> <span class="o">|</span> |
| <span class="o">|</span><span class="mi">18</span> <span class="o">|</span><span class="n">Mike</span> <span class="o">|</span> |
| <span class="o">|</span><span class="k">null</span><span class="o">|</span><span class="n">Marry</span> <span class="o">|</span> |
| <span class="o">|</span><span class="k">null</span><span class="o">|</span><span class="n">Albert</span> <span class="o">|</span> |
| <span class="o">+</span><span class="c1">----+--------+</span></code></pre></figure> |
| |
| <h3 id="set-operators-union-intersect-except-">Set operators (UNION, INTERSECT, EXCEPT) <a name="set-operators"></a></h3> |
| <p><code class="highlighter-rouge">NULL</code> values are compared in a null-safe manner for equality in the context of |
| set operations. That means when comparing rows, two <code class="highlighter-rouge">NULL</code> values are considered |
| equal unlike the regular <code class="highlighter-rouge">EqualTo</code>(<code class="highlighter-rouge">=</code>) operator.</p> |
| |
| <h4 id="examples-8">Examples</h4> |
| |
| <figure class="highlight"><pre><code class="language-sql" data-lang="sql"><span class="k">CREATE</span> <span class="k">VIEW</span> <span class="n">unknown_age</span> <span class="k">SELECT</span> <span class="o">*</span> <span class="k">FROM</span> <span class="n">person</span> <span class="k">WHERE</span> <span class="n">age</span> <span class="k">IS</span> <span class="k">NULL</span><span class="p">;</span> |
| |
| <span class="c1">-- Only common rows between two legs of `INTERSECT` are in the </span> |
| <span class="c1">-- result set. The comparison between columns of the row are done</span> |
| <span class="c1">-- in a null-safe manner.</span> |
| <span class="k">SELECT</span> <span class="n">name</span><span class="p">,</span> <span class="n">age</span> <span class="k">FROM</span> <span class="n">person</span> |
| <span class="k">INTERSECT</span> |
| <span class="k">SELECT</span> <span class="n">name</span><span class="p">,</span> <span class="n">age</span> <span class="k">from</span> <span class="n">unknown_age</span><span class="p">;</span> |
| <span class="o">+</span><span class="c1">------+----+ </span> |
| <span class="o">|</span><span class="n">name</span> <span class="o">|</span><span class="n">age</span> <span class="o">|</span> |
| <span class="o">+</span><span class="c1">------+----+</span> |
| <span class="o">|</span><span class="n">Albert</span><span class="o">|</span><span class="k">null</span><span class="o">|</span> |
| <span class="o">|</span><span class="n">Marry</span> <span class="o">|</span><span class="k">null</span><span class="o">|</span> |
| <span class="o">+</span><span class="c1">------+----+</span> |
| |
| <span class="c1">-- `NULL` values from two legs of the `EXCEPT` are not in output. </span> |
| <span class="c1">-- This basically shows that the comparison happens in a null-safe manner.</span> |
| <span class="k">SELECT</span> <span class="n">age</span><span class="p">,</span> <span class="n">name</span> <span class="k">FROM</span> <span class="n">person</span> |
| <span class="k">EXCEPT</span> |
| <span class="k">SELECT</span> <span class="n">age</span> <span class="k">FROM</span> <span class="n">unknown_age</span><span class="p">;</span> |
| <span class="o">+</span><span class="c1">---+--------+ </span> |
| <span class="o">|</span><span class="n">age</span><span class="o">|</span><span class="n">name</span> <span class="o">|</span> |
| <span class="o">+</span><span class="c1">---+--------+</span> |
| <span class="o">|</span><span class="mi">30</span> <span class="o">|</span><span class="n">Joe</span> <span class="o">|</span> |
| <span class="o">|</span><span class="mi">50</span> <span class="o">|</span><span class="n">Fred</span> <span class="o">|</span> |
| <span class="o">|</span><span class="mi">30</span> <span class="o">|</span><span class="n">Michelle</span><span class="o">|</span> |
| <span class="o">|</span><span class="mi">18</span> <span class="o">|</span><span class="n">Mike</span> <span class="o">|</span> |
| <span class="o">|</span><span class="mi">50</span> <span class="o">|</span><span class="n">Dan</span> <span class="o">|</span> |
| <span class="o">+</span><span class="c1">---+--------+</span> |
| |
| <span class="c1">-- Performs `UNION` operation between two sets of data. </span> |
| <span class="c1">-- The comparison between columns of the row ae done in</span> |
| <span class="c1">-- null-safe manner.</span> |
| <span class="k">SELECT</span> <span class="n">name</span><span class="p">,</span> <span class="n">age</span> <span class="k">FROM</span> <span class="n">person</span> |
| <span class="k">UNION</span> |
| <span class="k">SELECT</span> <span class="n">name</span><span class="p">,</span> <span class="n">age</span> <span class="k">FROM</span> <span class="n">unknown_age</span><span class="p">;</span> |
| <span class="o">+</span><span class="c1">--------+----+ </span> |
| <span class="o">|</span><span class="n">name</span> <span class="o">|</span><span class="n">age</span> <span class="o">|</span> |
| <span class="o">+</span><span class="c1">--------+----+</span> |
| <span class="o">|</span><span class="n">Albert</span> <span class="o">|</span><span class="k">null</span><span class="o">|</span> |
| <span class="o">|</span><span class="n">Joe</span> <span class="o">|</span><span class="mi">30</span> <span class="o">|</span> |
| <span class="o">|</span><span class="n">Michelle</span><span class="o">|</span><span class="mi">30</span> <span class="o">|</span> |
| <span class="o">|</span><span class="n">Marry</span> <span class="o">|</span><span class="k">null</span><span class="o">|</span> |
| <span class="o">|</span><span class="n">Fred</span> <span class="o">|</span><span class="mi">50</span> <span class="o">|</span> |
| <span class="o">|</span><span class="n">Mike</span> <span class="o">|</span><span class="mi">18</span> <span class="o">|</span> |
| <span class="o">|</span><span class="n">Dan</span> <span class="o">|</span><span class="mi">50</span> <span class="o">|</span> |
| <span class="o">+</span><span class="c1">--------+----+</span></code></pre></figure> |
| |
| <h3 id="existsnot-exists-subquery-">EXISTS/NOT EXISTS Subquery <a name="exists-not-exists"></a></h3> |
| <p>In Spark, EXISTS and NOT EXISTS expressions are allowed inside a WHERE clause. |
| These are boolean expressions which return either <code class="highlighter-rouge">TRUE</code> or |
| <code class="highlighter-rouge">FALSE</code>. In other words, EXISTS is a membership condition and returns <code class="highlighter-rouge">TRUE</code> |
| when the subquery it refers to returns one or more rows. Similary, NOT EXISTS |
| is a non-membership condition and returns TRUE when no rows or zero rows are |
| returned from the subquery.</p> |
| |
| <p>These two expressions are not affected by presence of NULL in the result of |
| the subquery. They are normally faster because they can be converted to |
| semijoins / anti-semijoins without special provisions for null awareness.</p> |
| |
| <h4 id="examples-9">Examples</h4> |
| |
| <figure class="highlight"><pre><code class="language-sql" data-lang="sql"><span class="c1">-- Even if subquery produces rows with `NULL` values, the `EXISTS` expression</span> |
| <span class="c1">-- evaluates to `TRUE` as the subquery produces 1 row.</span> |
| <span class="k">SELECT</span> <span class="o">*</span> <span class="k">FROM</span> <span class="n">person</span> <span class="k">WHERE</span> <span class="k">EXISTS</span> <span class="p">(</span><span class="k">SELECT</span> <span class="k">null</span><span class="p">);</span> |
| <span class="o">+</span><span class="c1">--------+----+ </span> |
| <span class="o">|</span><span class="n">name</span> <span class="o">|</span><span class="n">age</span> <span class="o">|</span> |
| <span class="o">+</span><span class="c1">--------+----+</span> |
| <span class="o">|</span><span class="n">Albert</span> <span class="o">|</span><span class="k">null</span><span class="o">|</span> |
| <span class="o">|</span><span class="n">Michelle</span><span class="o">|</span><span class="mi">30</span> <span class="o">|</span> |
| <span class="o">|</span><span class="n">Fred</span> <span class="o">|</span><span class="mi">50</span> <span class="o">|</span> |
| <span class="o">|</span><span class="n">Mike</span> <span class="o">|</span><span class="mi">18</span> <span class="o">|</span> |
| <span class="o">|</span><span class="n">Dan</span> <span class="o">|</span><span class="mi">50</span> <span class="o">|</span> |
| <span class="o">|</span><span class="n">Marry</span> <span class="o">|</span><span class="k">null</span><span class="o">|</span> |
| <span class="o">|</span><span class="n">Joe</span> <span class="o">|</span><span class="mi">30</span> <span class="o">|</span> |
| <span class="o">+</span><span class="c1">--------+----+</span> |
| |
| <span class="c1">-- `NOT EXISTS` expression returns `FALSE`. It returns `TRUE` only when</span> |
| <span class="c1">-- subquery produces no rows. In this case, it returns 1 row.</span> |
| <span class="k">SELECT</span> <span class="o">*</span> <span class="k">FROM</span> <span class="n">person</span> <span class="k">WHERE</span> <span class="k">NOT</span> <span class="k">EXISTS</span> <span class="p">(</span><span class="k">SELECT</span> <span class="k">null</span><span class="p">);</span> |
| <span class="o">+</span><span class="c1">----+---+</span> |
| <span class="o">|</span><span class="n">name</span><span class="o">|</span><span class="n">age</span><span class="o">|</span> |
| <span class="o">+</span><span class="c1">----+---+</span> |
| <span class="o">+</span><span class="c1">----+---+</span> |
| |
| <span class="c1">-- `NOT EXISTS` expression returns `TRUE`.</span> |
| <span class="k">SELECT</span> <span class="o">*</span> <span class="k">FROM</span> <span class="n">person</span> <span class="k">WHERE</span> <span class="k">NOT</span> <span class="k">EXISTS</span> <span class="p">(</span><span class="k">SELECT</span> <span class="mi">1</span> <span class="k">WHERE</span> <span class="mi">1</span> <span class="o">=</span> <span class="mi">0</span><span class="p">);</span> |
| <span class="o">+</span><span class="c1">--------+----+</span> |
| <span class="o">|</span><span class="n">name</span> <span class="o">|</span><span class="n">age</span> <span class="o">|</span> |
| <span class="o">+</span><span class="c1">--------+----+</span> |
| <span class="o">|</span><span class="n">Albert</span> <span class="o">|</span><span class="k">null</span><span class="o">|</span> |
| <span class="o">|</span><span class="n">Michelle</span><span class="o">|</span><span class="mi">30</span> <span class="o">|</span> |
| <span class="o">|</span><span class="n">Fred</span> <span class="o">|</span><span class="mi">50</span> <span class="o">|</span> |
| <span class="o">|</span><span class="n">Mike</span> <span class="o">|</span><span class="mi">18</span> <span class="o">|</span> |
| <span class="o">|</span><span class="n">Dan</span> <span class="o">|</span><span class="mi">50</span> <span class="o">|</span> |
| <span class="o">|</span><span class="n">Marry</span> <span class="o">|</span><span class="k">null</span><span class="o">|</span> |
| <span class="o">|</span><span class="n">Joe</span> <span class="o">|</span><span class="mi">30</span> <span class="o">|</span> |
| <span class="o">+</span><span class="c1">--------+----+</span></code></pre></figure> |
| |
| <h3 id="innot-in-subquery-">IN/NOT IN Subquery <a name="in-not-in"></a></h3> |
| <p>In Spark, <code class="highlighter-rouge">IN</code> and <code class="highlighter-rouge">NOT IN</code> expressions are allowed inside a WHERE clause of |
| a query. Unlike the <code class="highlighter-rouge">EXISTS</code> expression, <code class="highlighter-rouge">IN</code> expression can return a <code class="highlighter-rouge">TRUE</code>, |
| <code class="highlighter-rouge">FALSE</code> or <code class="highlighter-rouge">UNKNOWN (NULL)</code> value. Conceptually a <code class="highlighter-rouge">IN</code> expression is semantically |
| equivalent to a set of equality condition separated by a disjunctive operator (<code class="highlighter-rouge">OR</code>). |
| For example, c1 IN (1, 2, 3) is semantically equivalent to <code class="highlighter-rouge">(C1 = 1 OR c1 = 2 OR c1 = 3)</code>.</p> |
| |
| <p>As far as handling <code class="highlighter-rouge">NULL</code> values are concerned, the semantics can be deduced from |
| the <code class="highlighter-rouge">NULL</code> value handling in comparison operators(<code class="highlighter-rouge">=</code>) and logical operators(<code class="highlighter-rouge">OR</code>). |
| To summarize, below are the rules for computing the result of an <code class="highlighter-rouge">IN</code> expression.</p> |
| |
| <ul> |
| <li>TRUE is returned when the non-NULL value in question is found in the list</li> |
| <li>FALSE is returned when the non-NULL value is not found in the list and the |
| list does not contain NULL values</li> |
| <li>UNKNOWN is returned when the value is <code class="highlighter-rouge">NULL</code>, or the non-NULL value is not found in the list |
| and the list contains at least one <code class="highlighter-rouge">NULL</code> value</li> |
| </ul> |
| |
| <p>NOT IN always returns UNKNOWN when the list contains <code class="highlighter-rouge">NULL</code>, regardless of the input value. |
| This is because IN returns UNKNOWN if the value is not in the list containing <code class="highlighter-rouge">NULL</code>, |
| and because NOT UNKNOWN is again UNKNOWN.</p> |
| |
| <h4 id="examples-10">Examples</h4> |
| |
| <figure class="highlight"><pre><code class="language-sql" data-lang="sql"><span class="c1">-- The subquery has only `NULL` value in its result set. Therefore,</span> |
| <span class="c1">-- the result of `IN` predicate is UNKNOWN.</span> |
| <span class="k">SELECT</span> <span class="o">*</span> <span class="k">FROM</span> <span class="n">person</span> <span class="k">WHERE</span> <span class="n">age</span> <span class="k">IN</span> <span class="p">(</span><span class="k">SELECT</span> <span class="k">null</span><span class="p">);</span> |
| <span class="o">+</span><span class="c1">----+---+</span> |
| <span class="o">|</span><span class="n">name</span><span class="o">|</span><span class="n">age</span><span class="o">|</span> |
| <span class="o">+</span><span class="c1">----+---+</span> |
| <span class="o">+</span><span class="c1">----+---+</span> |
| |
| <span class="c1">-- The subquery has `NULL` value in the result set as well as a valid </span> |
| <span class="c1">-- value `50`. Rows with age = 50 are returned. </span> |
| <span class="k">SELECT</span> <span class="o">*</span> <span class="k">FROM</span> <span class="n">person</span> |
| <span class="k">WHERE</span> <span class="n">age</span> <span class="k">IN</span> <span class="p">(</span><span class="k">SELECT</span> <span class="n">age</span> <span class="k">FROM</span> <span class="k">VALUES</span> <span class="p">(</span><span class="mi">50</span><span class="p">),</span> <span class="p">(</span><span class="k">null</span><span class="p">)</span> <span class="n">sub</span><span class="p">(</span><span class="n">age</span><span class="p">));</span> |
| <span class="o">+</span><span class="c1">----+---+</span> |
| <span class="o">|</span><span class="n">name</span><span class="o">|</span><span class="n">age</span><span class="o">|</span> |
| <span class="o">+</span><span class="c1">----+---+</span> |
| <span class="o">|</span><span class="n">Fred</span><span class="o">|</span><span class="mi">50</span> <span class="o">|</span> |
| <span class="o">|</span><span class="n">Dan</span> <span class="o">|</span><span class="mi">50</span> <span class="o">|</span> |
| <span class="o">+</span><span class="c1">----+---+</span> |
| |
| <span class="c1">-- Since subquery has `NULL` value in the result set, the `NOT IN`</span> |
| <span class="c1">-- predicate would return UNKNOWN. Hence, no rows are</span> |
| <span class="c1">-- qualified for this query.</span> |
| <span class="k">SELECT</span> <span class="o">*</span> <span class="k">FROM</span> <span class="n">person</span> |
| <span class="k">WHERE</span> <span class="n">age</span> <span class="k">NOT</span> <span class="k">IN</span> <span class="p">(</span><span class="k">SELECT</span> <span class="n">age</span> <span class="k">FROM</span> <span class="k">VALUES</span> <span class="p">(</span><span class="mi">50</span><span class="p">),</span> <span class="p">(</span><span class="k">null</span><span class="p">)</span> <span class="n">sub</span><span class="p">(</span><span class="n">age</span><span class="p">));</span> |
| <span class="o">+</span><span class="c1">----+---+</span> |
| <span class="o">|</span><span class="n">name</span><span class="o">|</span><span class="n">age</span><span class="o">|</span> |
| <span class="o">+</span><span class="c1">----+---+</span> |
| <span class="o">+</span><span class="c1">----+---+</span></code></pre></figure> |
| |
| |
| |
| </div> |
| |
| <!-- /container --> |
| </div> |
| |
| <script src="js/vendor/jquery-3.4.1.min.js"></script> |
| <script src="js/vendor/bootstrap.min.js"></script> |
| <script src="js/vendor/anchor.min.js"></script> |
| <script src="js/main.js"></script> |
| |
| <!-- MathJax Section --> |
| <script type="text/x-mathjax-config"> |
| MathJax.Hub.Config({ |
| TeX: { equationNumbers: { autoNumber: "AMS" } } |
| }); |
| </script> |
| <script> |
| // Note that we load MathJax this way to work with local file (file://), HTTP and HTTPS. |
| // We could use "//cdn.mathjax...", but that won't support "file://". |
| (function(d, script) { |
| script = d.createElement('script'); |
| script.type = 'text/javascript'; |
| script.async = true; |
| script.onload = function(){ |
| MathJax.Hub.Config({ |
| tex2jax: { |
| inlineMath: [ ["$", "$"], ["\\\\(","\\\\)"] ], |
| displayMath: [ ["$$","$$"], ["\\[", "\\]"] ], |
| processEscapes: true, |
| skipTags: ['script', 'noscript', 'style', 'textarea', 'pre'] |
| } |
| }); |
| }; |
| script.src = ('https:' == document.location.protocol ? 'https://' : 'http://') + |
| 'cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.1/MathJax.js' + |
| '?config=TeX-AMS-MML_HTMLorMML'; |
| d.getElementsByTagName('head')[0].appendChild(script); |
| }(document)); |
| </script> |
| </body> |
| </html> |