<!DOCTYPE html>
<!--[if lt IE 7]> <html class="no-js lt-ie9 lt-ie8 lt-ie7" lang="en"> <![endif]-->
<!--[if IE 7]> <html class="no-js lt-ie9 lt-ie8" lang="en"> <![endif]-->
<!--[if IE 8]> <html class="no-js lt-ie9" lang="en"> <![endif]-->
<!--[if gt IE 8]><!--> <html class="no-js" lang="en"> <!--<![endif]-->
<head>
<meta charset="utf-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1">
<title>Scalar User Defined Functions (UDFs) - Spark 3.0.2 Documentation</title>
<link rel="stylesheet" href="css/bootstrap.min.css">
<style>
body {
padding-top: 60px;
padding-bottom: 40px;
}
</style>
<meta name="viewport" content="width=device-width">
<link rel="stylesheet" href="css/bootstrap-responsive.min.css">
<link rel="stylesheet" href="css/main.css">
<script src="js/vendor/modernizr-2.6.1-respond-1.1.0.min.js"></script>
<link rel="stylesheet" href="css/pygments-default.css">
<!-- Google analytics script -->
<script type="text/javascript">
// Google Analytics (classic ga.js) bootstrap.
// Tracking commands are queued on the global _gaq array; they are presumably
// consumed by ga.js once that script has loaded.
var _gaq = _gaq || [];  // reuse an existing queue if one is already defined
_gaq.push(['_setAccount', 'UA-32518208-2']);
_gaq.push(['_trackPageview']);
(function() {
  // Create the loader element and mark it async so it does not block parsing.
  var ga = document.createElement('script');
  ga.type = 'text/javascript';
  ga.async = true;
  // Always fetch the tracker over HTTPS. An absolute https:// URL resolves
  // whether this page was opened via file://, http:// or https://, and it
  // avoids pulling executable code over an unencrypted connection.
  ga.src = 'https://ssl.google-analytics.com/ga.js';
  // Insert ahead of the first script element in the document.
  var s = document.getElementsByTagName('script')[0];
  s.parentNode.insertBefore(ga, s);
})();
</script>
</head>
<body>
<!--[if lt IE 7]>
<p class="chromeframe">You are using an outdated browser. <a href="https://browsehappy.com/">Upgrade your browser today</a> or <a href="http://www.google.com/chromeframe/?redirect=true">install Google Chrome Frame</a> to better experience this site.</p>
<![endif]-->
<!-- This code is taken from http://twitter.github.com/bootstrap/examples/hero.html -->
<div class="navbar navbar-fixed-top" id="topbar">
<div class="navbar-inner">
<div class="container">
<div class="brand"><a href="index.html">
<img src="img/spark-logo-hd.png" alt="Apache Spark" style="height:50px;"/></a><span class="version">3.0.2</span>
</div>
<ul class="nav">
<!--TODO(andyk): Add class="active" attribute to li some how.-->
<li><a href="index.html">Overview</a></li>
<li class="dropdown">
<a href="#" class="dropdown-toggle" data-toggle="dropdown">Programming Guides<b class="caret"></b></a>
<ul class="dropdown-menu">
<li><a href="quick-start.html">Quick Start</a></li>
<li><a href="rdd-programming-guide.html">RDDs, Accumulators, Broadcast Vars</a></li>
<li><a href="sql-programming-guide.html">SQL, DataFrames, and Datasets</a></li>
<li><a href="structured-streaming-programming-guide.html">Structured Streaming</a></li>
<li><a href="streaming-programming-guide.html">Spark Streaming (DStreams)</a></li>
<li><a href="ml-guide.html">MLlib (Machine Learning)</a></li>
<li><a href="graphx-programming-guide.html">GraphX (Graph Processing)</a></li>
<li><a href="sparkr.html">SparkR (R on Spark)</a></li>
</ul>
</li>
<li class="dropdown">
<a href="#" class="dropdown-toggle" data-toggle="dropdown">API Docs<b class="caret"></b></a>
<ul class="dropdown-menu">
<li><a href="api/scala/org/apache/spark/index.html">Scala</a></li>
<li><a href="api/java/index.html">Java</a></li>
<li><a href="api/python/index.html">Python</a></li>
<li><a href="api/R/index.html">R</a></li>
<li><a href="api/sql/index.html">SQL, Built-in Functions</a></li>
</ul>
</li>
<li class="dropdown">
<a href="#" class="dropdown-toggle" data-toggle="dropdown">Deploying<b class="caret"></b></a>
<ul class="dropdown-menu">
<li><a href="cluster-overview.html">Overview</a></li>
<li><a href="submitting-applications.html">Submitting Applications</a></li>
<li class="divider"></li>
<li><a href="spark-standalone.html">Spark Standalone</a></li>
<li><a href="running-on-mesos.html">Mesos</a></li>
<li><a href="running-on-yarn.html">YARN</a></li>
<li><a href="running-on-kubernetes.html">Kubernetes</a></li>
</ul>
</li>
<li class="dropdown">
<a href="api.html" class="dropdown-toggle" data-toggle="dropdown">More<b class="caret"></b></a>
<ul class="dropdown-menu">
<li><a href="configuration.html">Configuration</a></li>
<li><a href="monitoring.html">Monitoring</a></li>
<li><a href="tuning.html">Tuning Guide</a></li>
<li><a href="job-scheduling.html">Job Scheduling</a></li>
<li><a href="security.html">Security</a></li>
<li><a href="hardware-provisioning.html">Hardware Provisioning</a></li>
<li><a href="migration-guide.html">Migration Guide</a></li>
<li class="divider"></li>
<li><a href="building-spark.html">Building Spark</a></li>
<li><a href="https://spark.apache.org/contributing.html">Contributing to Spark</a></li>
<li><a href="https://spark.apache.org/third-party-projects.html">Third Party Projects</a></li>
</ul>
</li>
</ul>
<!--<p class="navbar-text pull-right"><span class="version-text">v3.0.2</span></p>-->
</div>
</div>
</div>
<div class="container-wrapper">
<div class="left-menu-wrapper">
<div class="left-menu">
<h3><a href="sql-programming-guide.html">Spark SQL Guide</a></h3>
<ul>
<li>
<a href="sql-getting-started.html">
Getting Started
</a>
</li>
<li>
<a href="sql-data-sources.html">
Data Sources
</a>
</li>
<li>
<a href="sql-performance-tuning.html">
Performance Tuning
</a>
</li>
<li>
<a href="sql-distributed-sql-engine.html">
Distributed SQL Engine
</a>
</li>
<li>
<a href="sql-pyspark-pandas-with-arrow.html">
PySpark Usage Guide for Pandas with Apache Arrow
</a>
</li>
<li>
<a href="sql-migration-old.html">
Migration Guide
</a>
</li>
<li>
<a href="sql-ref.html">
SQL Reference
</a>
</li>
<ul>
<li>
<a href="sql-ref-ansi-compliance.html">
ANSI Compliance
</a>
</li>
<li>
<a href="sql-ref-datatypes.html">
Data Types
</a>
</li>
<li>
<a href="sql-ref-datetime-pattern.html">
Datetime Pattern
</a>
</li>
<li>
<a href="sql-ref-functions.html">
Functions
</a>
</li>
<ul>
<li>
<a href="sql-ref-functions-builtin.html">
Built-in Functions
</a>
</li>
<li>
<a href="sql-ref-functions-udf-scalar.html">
<b>Scalar UDFs (User-Defined Functions)</b>
</a>
</li>
<li>
<a href="sql-ref-functions-udf-aggregate.html">
UDAFs (User-Defined Aggregate Functions)
</a>
</li>
<li>
<a href="sql-ref-functions-udf-hive.html">
Integration with Hive UDFs/UDAFs/UDTFs
</a>
</li>
</ul>
<li>
<a href="sql-ref-identifier.html">
Identifiers
</a>
</li>
<li>
<a href="sql-ref-literals.html">
Literals
</a>
</li>
<li>
<a href="sql-ref-null-semantics.html">
Null Semantics
</a>
</li>
<li>
<a href="sql-ref-syntax.html">
SQL Syntax
</a>
</li>
</ul>
</ul>
</div>
</div>
<input id="nav-trigger" class="nav-trigger" checked type="checkbox">
<label for="nav-trigger"></label>
<div class="content-with-sidebar" id="content">
<h1 class="title">Scalar User Defined Functions (UDFs)</h1>
<h3 id="description">Description</h3>
<p>User-Defined Functions (UDFs) are user-programmable routines that act on one row. This documentation lists the classes that are required for creating and registering UDFs. It also contains examples that demonstrate how to define and register UDFs and invoke them in Spark SQL.</p>
<h3 id="userdefinedfunction">UserDefinedFunction</h3>
<p>To define the properties of a user-defined function, the user can use some of the methods defined in this class.</p>
<ul>
<li>
<p><strong>asNonNullable(): UserDefinedFunction</strong></p>
<p>Updates UserDefinedFunction to non-nullable.</p>
</li>
<li>
<p><strong>asNondeterministic(): UserDefinedFunction</strong></p>
<p>Updates UserDefinedFunction to nondeterministic.</p>
</li>
<li>
<p><strong>withName(name: String): UserDefinedFunction</strong></p>
<p>Updates UserDefinedFunction with a given name.</p>
</li>
</ul>
<h3 id="examples">Examples</h3>
<div class="codetabs">
<div data-lang="scala">
<div class="highlight"><pre class="codehilite"><code><span class="k">import</span> <span class="nn">org.apache.spark.sql.SparkSession</span>
<span class="k">import</span> <span class="nn">org.apache.spark.sql.functions.udf</span>
<span class="k">val</span> <span class="nv">spark</span> <span class="k">=</span> <span class="nc">SparkSession</span>
<span class="o">.</span><span class="py">builder</span><span class="o">()</span>
<span class="o">.</span><span class="py">appName</span><span class="o">(</span><span class="s">"Spark SQL UDF scalar example"</span><span class="o">)</span>
<span class="o">.</span><span class="py">getOrCreate</span><span class="o">()</span>
<span class="c1">// Define and register a zero-argument non-deterministic UDF</span>
<span class="c1">// UDF is deterministic by default, i.e. produces the same result for the same input.</span>
<span class="k">val</span> <span class="nv">random</span> <span class="k">=</span> <span class="nf">udf</span><span class="o">(()</span> <span class="k">=&gt;</span> <span class="nv">Math</span><span class="o">.</span><span class="py">random</span><span class="o">())</span>
<span class="nv">spark</span><span class="o">.</span><span class="py">udf</span><span class="o">.</span><span class="py">register</span><span class="o">(</span><span class="s">"random"</span><span class="o">,</span> <span class="nv">random</span><span class="o">.</span><span class="py">asNondeterministic</span><span class="o">())</span>
<span class="nv">spark</span><span class="o">.</span><span class="py">sql</span><span class="o">(</span><span class="s">"SELECT random()"</span><span class="o">).</span><span class="py">show</span><span class="o">()</span>
<span class="c1">// +-------+</span>
<span class="c1">// |UDF() |</span>
<span class="c1">// +-------+</span>
<span class="c1">// |xxxxxxx|</span>
<span class="c1">// +-------+</span>
<span class="c1">// Define and register a one-argument UDF</span>
<span class="k">val</span> <span class="nv">plusOne</span> <span class="k">=</span> <span class="nf">udf</span><span class="o">((</span><span class="n">x</span><span class="k">:</span> <span class="kt">Int</span><span class="o">)</span> <span class="k">=&gt;</span> <span class="n">x</span> <span class="o">+</span> <span class="mi">1</span><span class="o">)</span>
<span class="nv">spark</span><span class="o">.</span><span class="py">udf</span><span class="o">.</span><span class="py">register</span><span class="o">(</span><span class="s">"plusOne"</span><span class="o">,</span> <span class="n">plusOne</span><span class="o">)</span>
<span class="nv">spark</span><span class="o">.</span><span class="py">sql</span><span class="o">(</span><span class="s">"SELECT plusOne(5)"</span><span class="o">).</span><span class="py">show</span><span class="o">()</span>
<span class="c1">// +------+</span>
<span class="c1">// |UDF(5)|</span>
<span class="c1">// +------+</span>
<span class="c1">// | 6|</span>
<span class="c1">// +------+</span>
<span class="c1">// Define a two-argument UDF and register it with Spark in one step</span>
<span class="nv">spark</span><span class="o">.</span><span class="py">udf</span><span class="o">.</span><span class="py">register</span><span class="o">(</span><span class="s">"strLenScala"</span><span class="o">,</span> <span class="o">(</span><span class="k">_:</span> <span class="kt">String</span><span class="o">).</span><span class="py">length</span> <span class="o">+</span> <span class="o">(</span><span class="k">_:</span> <span class="kt">Int</span><span class="o">))</span>
<span class="nv">spark</span><span class="o">.</span><span class="py">sql</span><span class="o">(</span><span class="s">"SELECT strLenScala('test', 1)"</span><span class="o">).</span><span class="py">show</span><span class="o">()</span>
<span class="c1">// +--------------------+</span>
<span class="c1">// |strLenScala(test, 1)|</span>
<span class="c1">// +--------------------+</span>
<span class="c1">// | 5|</span>
<span class="c1">// +--------------------+</span>
<span class="c1">// UDF in a WHERE clause</span>
<span class="nv">spark</span><span class="o">.</span><span class="py">udf</span><span class="o">.</span><span class="py">register</span><span class="o">(</span><span class="s">"oneArgFilter"</span><span class="o">,</span> <span class="o">(</span><span class="n">n</span><span class="k">:</span> <span class="kt">Int</span><span class="o">)</span> <span class="k">=&gt;</span> <span class="o">{</span> <span class="n">n</span> <span class="o">&gt;</span> <span class="mi">5</span> <span class="o">})</span>
<span class="nv">spark</span><span class="o">.</span><span class="py">range</span><span class="o">(</span><span class="mi">1</span><span class="o">,</span> <span class="mi">10</span><span class="o">).</span><span class="py">createOrReplaceTempView</span><span class="o">(</span><span class="s">"test"</span><span class="o">)</span>
<span class="nv">spark</span><span class="o">.</span><span class="py">sql</span><span class="o">(</span><span class="s">"SELECT * FROM test WHERE oneArgFilter(id)"</span><span class="o">).</span><span class="py">show</span><span class="o">()</span>
<span class="c1">// +---+</span>
<span class="c1">// | id|</span>
<span class="c1">// +---+</span>
<span class="c1">// | 6|</span>
<span class="c1">// | 7|</span>
<span class="c1">// | 8|</span>
<span class="c1">// | 9|</span>
<span class="c1">// +---+</span></code></pre></div>
<div><small>Find full example code at "examples/src/main/scala/org/apache/spark/examples/sql/UserDefinedScalar.scala" in the Spark repo.</small></div>
</div>
<div data-lang="java">
<div class="highlight"><pre class="codehilite"><code><span class="kn">import</span> <span class="nn">org.apache.spark.sql.*</span><span class="o">;</span>
<span class="kn">import</span> <span class="nn">org.apache.spark.sql.api.java.UDF1</span><span class="o">;</span>
<span class="kn">import</span> <span class="nn">org.apache.spark.sql.expressions.UserDefinedFunction</span><span class="o">;</span>
<span class="kn">import</span> <span class="nn">static</span> <span class="n">org</span><span class="o">.</span><span class="na">apache</span><span class="o">.</span><span class="na">spark</span><span class="o">.</span><span class="na">sql</span><span class="o">.</span><span class="na">functions</span><span class="o">.</span><span class="na">udf</span><span class="o">;</span>
<span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.DataTypes</span><span class="o">;</span>
<span class="nc">SparkSession</span> <span class="n">spark</span> <span class="o">=</span> <span class="nc">SparkSession</span>
<span class="o">.</span><span class="na">builder</span><span class="o">()</span>
<span class="o">.</span><span class="na">appName</span><span class="o">(</span><span class="s">"Java Spark SQL UDF scalar example"</span><span class="o">)</span>
<span class="o">.</span><span class="na">getOrCreate</span><span class="o">();</span>
<span class="c1">// Define and register a zero-argument non-deterministic UDF</span>
<span class="c1">// UDF is deterministic by default, i.e. produces the same result for the same input.</span>
<span class="nc">UserDefinedFunction</span> <span class="n">random</span> <span class="o">=</span> <span class="n">udf</span><span class="o">(</span>
<span class="o">()</span> <span class="o">-&gt;</span> <span class="nc">Math</span><span class="o">.</span><span class="na">random</span><span class="o">(),</span> <span class="nc">DataTypes</span><span class="o">.</span><span class="na">DoubleType</span>
<span class="o">);</span>
<span class="n">spark</span><span class="o">.</span><span class="na">udf</span><span class="o">().</span><span class="na">register</span><span class="o">(</span><span class="s">"random"</span><span class="o">,</span> <span class="n">random</span><span class="o">.</span><span class="na">asNondeterministic</span><span class="o">());</span>
<span class="n">spark</span><span class="o">.</span><span class="na">sql</span><span class="o">(</span><span class="s">"SELECT random()"</span><span class="o">).</span><span class="na">show</span><span class="o">();</span>
<span class="c1">// +-------+</span>
<span class="c1">// |UDF() |</span>
<span class="c1">// +-------+</span>
<span class="c1">// |xxxxxxx|</span>
<span class="c1">// +-------+</span>
<span class="c1">// Define and register a one-argument UDF</span>
<span class="n">spark</span><span class="o">.</span><span class="na">udf</span><span class="o">().</span><span class="na">register</span><span class="o">(</span><span class="s">"plusOne"</span><span class="o">,</span> <span class="k">new</span> <span class="no">UDF1</span><span class="o">&lt;</span><span class="nc">Integer</span><span class="o">,</span> <span class="nc">Integer</span><span class="o">&gt;()</span> <span class="o">{</span>
<span class="nd">@Override</span>
<span class="kd">public</span> <span class="nc">Integer</span> <span class="nf">call</span><span class="o">(</span><span class="nc">Integer</span> <span class="n">x</span><span class="o">)</span> <span class="o">{</span>
<span class="k">return</span> <span class="n">x</span> <span class="o">+</span> <span class="mi">1</span><span class="o">;</span>
<span class="o">}</span>
<span class="o">},</span> <span class="nc">DataTypes</span><span class="o">.</span><span class="na">IntegerType</span><span class="o">);</span>
<span class="n">spark</span><span class="o">.</span><span class="na">sql</span><span class="o">(</span><span class="s">"SELECT plusOne(5)"</span><span class="o">).</span><span class="na">show</span><span class="o">();</span>
<span class="c1">// +----------+</span>
<span class="c1">// |plusOne(5)|</span>
<span class="c1">// +----------+</span>
<span class="c1">// | 6|</span>
<span class="c1">// +----------+</span>
<span class="c1">// Define and register a two-argument UDF</span>
<span class="nc">UserDefinedFunction</span> <span class="n">strLen</span> <span class="o">=</span> <span class="n">udf</span><span class="o">(</span>
<span class="o">(</span><span class="nc">String</span> <span class="n">s</span><span class="o">,</span> <span class="nc">Integer</span> <span class="n">x</span><span class="o">)</span> <span class="o">-&gt;</span> <span class="n">s</span><span class="o">.</span><span class="na">length</span><span class="o">()</span> <span class="o">+</span> <span class="n">x</span><span class="o">,</span> <span class="nc">DataTypes</span><span class="o">.</span><span class="na">IntegerType</span>
<span class="o">);</span>
<span class="n">spark</span><span class="o">.</span><span class="na">udf</span><span class="o">().</span><span class="na">register</span><span class="o">(</span><span class="s">"strLen"</span><span class="o">,</span> <span class="n">strLen</span><span class="o">);</span>
<span class="n">spark</span><span class="o">.</span><span class="na">sql</span><span class="o">(</span><span class="s">"SELECT strLen('test', 1)"</span><span class="o">).</span><span class="na">show</span><span class="o">();</span>
<span class="c1">// +------------+</span>
<span class="c1">// |UDF(test, 1)|</span>
<span class="c1">// +------------+</span>
<span class="c1">// | 5|</span>
<span class="c1">// +------------+</span>
<span class="c1">// UDF in a WHERE clause</span>
<span class="n">spark</span><span class="o">.</span><span class="na">udf</span><span class="o">().</span><span class="na">register</span><span class="o">(</span><span class="s">"oneArgFilter"</span><span class="o">,</span> <span class="k">new</span> <span class="no">UDF1</span><span class="o">&lt;</span><span class="nc">Long</span><span class="o">,</span> <span class="nc">Boolean</span><span class="o">&gt;()</span> <span class="o">{</span>
<span class="nd">@Override</span>
<span class="kd">public</span> <span class="nc">Boolean</span> <span class="nf">call</span><span class="o">(</span><span class="nc">Long</span> <span class="n">x</span><span class="o">)</span> <span class="o">{</span>
<span class="k">return</span> <span class="n">x</span> <span class="o">&gt;</span> <span class="mi">5</span><span class="o">;</span>
<span class="o">}</span>
<span class="o">},</span> <span class="nc">DataTypes</span><span class="o">.</span><span class="na">BooleanType</span><span class="o">);</span>
<span class="n">spark</span><span class="o">.</span><span class="na">range</span><span class="o">(</span><span class="mi">1</span><span class="o">,</span> <span class="mi">10</span><span class="o">).</span><span class="na">createOrReplaceTempView</span><span class="o">(</span><span class="s">"test"</span><span class="o">);</span>
<span class="n">spark</span><span class="o">.</span><span class="na">sql</span><span class="o">(</span><span class="s">"SELECT * FROM test WHERE oneArgFilter(id)"</span><span class="o">).</span><span class="na">show</span><span class="o">();</span>
<span class="c1">// +---+</span>
<span class="c1">// | id|</span>
<span class="c1">// +---+</span>
<span class="c1">// | 6|</span>
<span class="c1">// | 7|</span>
<span class="c1">// | 8|</span>
<span class="c1">// | 9|</span>
<span class="c1">// +---+</span></code></pre></div>
<div><small>Find full example code at "examples/src/main/java/org/apache/spark/examples/sql/JavaUserDefinedScalar.java" in the Spark repo.</small></div>
</div>
</div>
<h3 id="related-statements">Related Statements</h3>
<ul>
<li><a href="sql-ref-functions-udf-aggregate.html">User Defined Aggregate Functions (UDAFs)</a></li>
<li><a href="sql-ref-functions-udf-hive.html">Integration with Hive UDFs/UDAFs/UDTFs</a></li>
</ul>
</div>
<!-- /container -->
</div>
<script src="js/vendor/jquery-3.4.1.min.js"></script>
<script src="js/vendor/bootstrap.min.js"></script>
<script src="js/vendor/anchor.min.js"></script>
<script src="js/main.js"></script>
<!-- MathJax Section -->
<script type="text/x-mathjax-config">
// The enclosing script tag has type "text/x-mathjax-config", so the browser does
// not execute this itself; it is presumably picked up by MathJax once it loads.
// Sets the TeX input option equationNumbers.autoNumber to "AMS".
MathJax.Hub.Config({
TeX: { equationNumbers: { autoNumber: "AMS" } }
});
</script>
<script>
// Load MathJax from the CDN and configure its tex2jax preprocessor once loaded.
// The script URL is always absolute HTTPS: that resolves when the page is opened
// from a local file (file://), over HTTP, or over HTTPS, and never fetches
// executable code over an unencrypted connection. A protocol-relative URL
// ("//cdn...") is not used because it would not resolve under "file://".
(function(d, script) {
  script = d.createElement('script');
  script.type = 'text/javascript';
  script.async = true;
  // Configure only after the script has loaded; the MathJax global used below
  // is expected to be defined by MathJax.js itself.
  script.onload = function(){
    MathJax.Hub.Config({
      tex2jax: {
        // NOTE(review): "\\\\(" evaluates to two backslashes followed by a
        // parenthesis, whereas the display delimiters below use a single
        // backslash. Left unchanged here; confirm which form the page
        // content is expected to contain.
        inlineMath: [ ["$", "$"], ["\\\\(","\\\\)"] ],
        displayMath: [ ["$$","$$"], ["\\[", "\\]"] ],
        processEscapes: true,
        // Tags whose contents are excluded from math processing.
        skipTags: ['script', 'noscript', 'style', 'textarea', 'pre']
      }
    });
  };
  script.src = 'https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.1/MathJax.js' +
    '?config=TeX-AMS-MML_HTMLorMML';
  d.getElementsByTagName('head')[0].appendChild(script);
}(document));
</script>
</body>
</html>