blob: f757c5cd61a12e3f155605af0767347b8dccaf25 [file] [log] [blame]
<!DOCTYPE html>
<!--[if lt IE 7]> <html class="no-js lt-ie9 lt-ie8 lt-ie7"> <![endif]-->
<!--[if IE 7]> <html class="no-js lt-ie9 lt-ie8"> <![endif]-->
<!--[if IE 8]> <html class="no-js lt-ie9"> <![endif]-->
<!--[if gt IE 8]><!--> <html class="no-js"> <!--<![endif]-->
<head>
<meta charset="utf-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1">
<title>TRANSFORM - Spark 3.3.0 Documentation</title>
<link rel="stylesheet" href="css/bootstrap.min.css">
<style>
body {
padding-top: 60px;
padding-bottom: 40px;
}
</style>
<meta name="viewport" content="width=device-width">
<link rel="stylesheet" href="css/main.css">
<script src="js/vendor/modernizr-2.6.1-respond-1.1.0.min.js"></script>
<link rel="stylesheet" href="css/pygments-default.css">
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/docsearch.js@2/dist/cdn/docsearch.min.css" />
<link rel="stylesheet" href="css/docsearch.css">
</head>
<body>
<!--[if lt IE 7]>
<p class="chromeframe">You are using an outdated browser. <a href="https://browsehappy.com/">Upgrade your browser today</a> or <a href="http://www.google.com/chromeframe/?redirect=true">install Google Chrome Frame</a> to better experience this site.</p>
<![endif]-->
<!-- This code is taken from http://twitter.github.com/bootstrap/examples/hero.html -->
<nav class="navbar fixed-top navbar-expand-md navbar-light bg-light" id="topbar">
<div class="container">
<div class="navbar-header">
<div class="navbar-brand"><a href="index.html">
<img src="img/spark-logo-hd.png" style="height:50px;"/></a><span class="version">3.3.0</span>
</div>
</div>
<button class="navbar-toggler" type="button" data-toggle="collapse"
data-target="#navbarCollapse" aria-controls="navbarCollapse"
aria-expanded="false" aria-label="Toggle navigation">
<span class="navbar-toggler-icon"></span>
</button>
<div class="collapse navbar-collapse" id="navbarCollapse">
<ul class="navbar-nav">
<!--TODO(andyk): Add class="active" attribute to li some how.-->
<li class="nav-item"><a href="index.html" class="nav-link">Overview</a></li>
<li class="nav-item dropdown">
<a href="#" class="nav-link dropdown-toggle" id="navbarQuickStart" role="button" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false">Programming Guides</a>
<div class="dropdown-menu" aria-labelledby="navbarQuickStart">
<a class="dropdown-item" href="quick-start.html">Quick Start</a>
<a class="dropdown-item" href="rdd-programming-guide.html">RDDs, Accumulators, Broadcasts Vars</a>
<a class="dropdown-item" href="sql-programming-guide.html">SQL, DataFrames, and Datasets</a>
<a class="dropdown-item" href="structured-streaming-programming-guide.html">Structured Streaming</a>
<a class="dropdown-item" href="streaming-programming-guide.html">Spark Streaming (DStreams)</a>
<a class="dropdown-item" href="ml-guide.html">MLlib (Machine Learning)</a>
<a class="dropdown-item" href="graphx-programming-guide.html">GraphX (Graph Processing)</a>
<a class="dropdown-item" href="sparkr.html">SparkR (R on Spark)</a>
<a class="dropdown-item" href="api/python/getting_started/index.html">PySpark (Python on Spark)</a>
</div>
</li>
<li class="nav-item dropdown">
<a href="#" class="nav-link dropdown-toggle" id="navbarAPIDocs" role="button" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false">API Docs</a>
<div class="dropdown-menu" aria-labelledby="navbarAPIDocs">
<a class="dropdown-item" href="api/scala/org/apache/spark/index.html">Scala</a>
<a class="dropdown-item" href="api/java/index.html">Java</a>
<a class="dropdown-item" href="api/python/index.html">Python</a>
<a class="dropdown-item" href="api/R/index.html">R</a>
<a class="dropdown-item" href="api/sql/index.html">SQL, Built-in Functions</a>
</div>
</li>
<li class="nav-item dropdown">
<a href="#" class="nav-link dropdown-toggle" id="navbarDeploying" role="button" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false">Deploying</a>
<div class="dropdown-menu" aria-labelledby="navbarDeploying">
<a class="dropdown-item" href="cluster-overview.html">Overview</a>
<a class="dropdown-item" href="submitting-applications.html">Submitting Applications</a>
<div class="dropdown-divider"></div>
<a class="dropdown-item" href="spark-standalone.html">Spark Standalone</a>
<a class="dropdown-item" href="running-on-mesos.html">Mesos</a>
<a class="dropdown-item" href="running-on-yarn.html">YARN</a>
<a class="dropdown-item" href="running-on-kubernetes.html">Kubernetes</a>
</div>
</li>
<li class="nav-item dropdown">
<a href="#" class="nav-link dropdown-toggle" id="navbarMore" role="button" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false">More</a>
<div class="dropdown-menu" aria-labelledby="navbarMore">
<a class="dropdown-item" href="configuration.html">Configuration</a>
<a class="dropdown-item" href="monitoring.html">Monitoring</a>
<a class="dropdown-item" href="tuning.html">Tuning Guide</a>
<a class="dropdown-item" href="job-scheduling.html">Job Scheduling</a>
<a class="dropdown-item" href="security.html">Security</a>
<a class="dropdown-item" href="hardware-provisioning.html">Hardware Provisioning</a>
<a class="dropdown-item" href="migration-guide.html">Migration Guide</a>
<div class="dropdown-divider"></div>
<a class="dropdown-item" href="building-spark.html">Building Spark</a>
<a class="dropdown-item" href="https://spark.apache.org/contributing.html">Contributing to Spark</a>
<a class="dropdown-item" href="https://spark.apache.org/third-party-projects.html">Third Party Projects</a>
</div>
</li>
<li class="nav-item">
<input type="text" id="docsearch-input" placeholder="Search the docs…">
</li>
</ul>
<!--<span class="navbar-text navbar-right"><span class="version-text">v3.3.0</span></span>-->
</div>
</div>
</nav>
<div class="container-wrapper">
<div class="left-menu-wrapper">
<div class="left-menu">
<h3><a href="sql-programming-guide.html">Spark SQL Guide</a></h3>
<ul>
<li>
<a href="sql-getting-started.html">
Getting Started
</a>
</li>
<li>
<a href="sql-data-sources.html">
Data Sources
</a>
</li>
<li>
<a href="sql-performance-tuning.html">
Performance Tuning
</a>
</li>
<li>
<a href="sql-distributed-sql-engine.html">
Distributed SQL Engine
</a>
</li>
<li>
<a href="sql-pyspark-pandas-with-arrow.html">
PySpark Usage Guide for Pandas with Apache Arrow
</a>
</li>
<li>
<a href="sql-migration-old.html">
Migration Guide
</a>
</li>
<li>
<a href="sql-ref.html">
SQL Reference
</a>
</li>
<ul>
<li>
<a href="sql-ref-ansi-compliance.html">
ANSI Compliance
</a>
</li>
<li>
<a href="sql-ref-datatypes.html">
Data Types
</a>
</li>
<li>
<a href="sql-ref-datetime-pattern.html">
Datetime Pattern
</a>
</li>
<li>
<a href="sql-ref-number-pattern.html">
Number Pattern
</a>
</li>
<li>
<a href="sql-ref-functions.html">
Functions
</a>
</li>
<li>
<a href="sql-ref-identifier.html">
Identifiers
</a>
</li>
<li>
<a href="sql-ref-literals.html">
Literals
</a>
</li>
<li>
<a href="sql-ref-null-semantics.html">
Null Semantics
</a>
</li>
<li>
<a href="sql-ref-syntax.html">
SQL Syntax
</a>
</li>
<ul>
<li>
<a href="sql-ref-syntax.html#ddl-statements">
Data Definition Statements
</a>
</li>
<li>
<a href="sql-ref-syntax.html#dml-statements">
Data Manipulation Statements
</a>
</li>
<li>
<a href="sql-ref-syntax.html#data-retrieval-statements">
Data Retrieval(Queries)
</a>
</li>
<li>
<a href="sql-ref-syntax.html#auxiliary-statements">
Auxiliary Statements
</a>
</li>
</ul>
</ul>
</ul>
</div>
</div>
<input id="nav-trigger" class="nav-trigger" checked type="checkbox">
<label for="nav-trigger"></label>
<div class="content-with-sidebar mr-3" id="content">
<h1 class="title">TRANSFORM</h1>
<h3 id="description">Description</h3>
<p>The <code class="language-plaintext highlighter-rouge">TRANSFORM</code> clause is used to specify a Hive-style transform query specification
to transform the inputs by running a user-specified command or script.</p>
<p>Spark&#8217;s script transform supports two modes:</p>
<ol>
<li>Hive support disabled: Spark script transform can run with <code class="language-plaintext highlighter-rouge">spark.sql.catalogImplementation=in-memory</code>
or without <code class="language-plaintext highlighter-rouge">SparkSession.builder.enableHiveSupport()</code>. In this case, now Spark only uses the script transform with
<code class="language-plaintext highlighter-rouge">ROW FORMAT DELIMITED</code> and treats all values passed to the script as strings.</li>
<li>Hive support enabled: When Spark is run with <code class="language-plaintext highlighter-rouge">spark.sql.catalogImplementation=hive</code> or Spark SQL is started
with <code class="language-plaintext highlighter-rouge">SparkSession.builder.enableHiveSupport()</code>, Spark can use the script transform with both Hive SerDe and
<code class="language-plaintext highlighter-rouge">ROW FORMAT DELIMITED</code>.</li>
</ol>
<h3 id="syntax">Syntax</h3>
<div class="language-sql highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="k">SELECT</span> <span class="k">TRANSFORM</span> <span class="p">(</span> <span class="n">expression</span> <span class="p">[</span> <span class="p">,</span> <span class="p">...</span> <span class="p">]</span> <span class="p">)</span>
<span class="p">[</span> <span class="k">ROW</span> <span class="n">FORMAT</span> <span class="n">row_format</span> <span class="p">]</span>
<span class="p">[</span> <span class="n">RECORDWRITER</span> <span class="n">record_writer_class</span> <span class="p">]</span>
<span class="k">USING</span> <span class="n">command_or_script</span> <span class="p">[</span> <span class="k">AS</span> <span class="p">(</span> <span class="p">[</span> <span class="n">col_name</span> <span class="p">[</span> <span class="n">col_type</span> <span class="p">]</span> <span class="p">]</span> <span class="p">[</span> <span class="p">,</span> <span class="p">...</span> <span class="p">]</span> <span class="p">)</span> <span class="p">]</span>
<span class="p">[</span> <span class="k">ROW</span> <span class="n">FORMAT</span> <span class="n">row_format</span> <span class="p">]</span>
<span class="p">[</span> <span class="n">RECORDREADER</span> <span class="n">record_reader_class</span> <span class="p">]</span>
</code></pre></div></div>
<h3 id="parameters">Parameters</h3>
<ul>
<li>
<p><strong>expression</strong></p>
<p>Specifies a combination of one or more values, operators and SQL functions that results in a value.</p>
</li>
<li>
<p><strong>row_format</strong></p>
<p>Specifies the row format for input and output. See <a href="sql-ref-syntax-hive-format.html">HIVE FORMAT</a> for more syntax details.</p>
</li>
<li>
<p><strong>RECORDWRITER</strong></p>
<p>Specifies a fully-qualified class name of a custom RecordWriter. The default value is <code class="language-plaintext highlighter-rouge">org.apache.hadoop.hive.ql.exec.TextRecordWriter</code>.</p>
</li>
<li>
<p><strong>RECORDREADER</strong></p>
<p>Specifies a fully-qualified class name of a custom RecordReader. The default value is <code class="language-plaintext highlighter-rouge">org.apache.hadoop.hive.ql.exec.TextRecordReader</code>.</p>
</li>
<li>
<p><strong>command_or_script</strong></p>
<p>Specifies a command or a path to script to process data.</p>
</li>
</ul>
<h3 id="row-format-delimited-behavior">ROW FORMAT DELIMITED BEHAVIOR</h3>
<p>When Spark uses <code class="language-plaintext highlighter-rouge">ROW FORMAT DELIMITED</code> format:</p>
<ul>
<li>Spark uses the character <code class="language-plaintext highlighter-rouge">\u0001</code> as the default field delimiter and this delimiter can be overridden by <code class="language-plaintext highlighter-rouge">FIELDS TERMINATED BY</code>.</li>
<li>Spark uses the character <code class="language-plaintext highlighter-rouge">\n</code> as the default line delimiter and this delimiter can be overridden by <code class="language-plaintext highlighter-rouge">LINES TERMINATED BY</code>.</li>
<li>Spark uses a string <code class="language-plaintext highlighter-rouge">\N</code> as the default <code class="language-plaintext highlighter-rouge">NULL</code> value in order to differentiate <code class="language-plaintext highlighter-rouge">NULL</code> values
from the literal string <code class="language-plaintext highlighter-rouge">NULL</code>. This delimiter can be overridden by <code class="language-plaintext highlighter-rouge">NULL DEFINED AS</code>.</li>
<li>Spark casts all columns to <code class="language-plaintext highlighter-rouge">STRING</code> and combines columns by tabs before feeding to the user script.
For complex types such as <code class="language-plaintext highlighter-rouge">ARRAY</code>/<code class="language-plaintext highlighter-rouge">MAP</code>/<code class="language-plaintext highlighter-rouge">STRUCT</code>, Spark uses <code class="language-plaintext highlighter-rouge">to_json</code> casts it to an input <code class="language-plaintext highlighter-rouge">JSON</code> string and uses
<code class="language-plaintext highlighter-rouge">from_json</code> to convert the result output <code class="language-plaintext highlighter-rouge">JSON</code> string to <code class="language-plaintext highlighter-rouge">ARRAY</code>/<code class="language-plaintext highlighter-rouge">MAP</code>/<code class="language-plaintext highlighter-rouge">STRUCT</code> data.</li>
<li><code class="language-plaintext highlighter-rouge">COLLECTION ITEMS TERMINATED BY</code> and <code class="language-plaintext highlighter-rouge">MAP KEYS TERMINATED BY</code> are delimiters to split complex data such as
<code class="language-plaintext highlighter-rouge">ARRAY</code>/<code class="language-plaintext highlighter-rouge">MAP</code>/<code class="language-plaintext highlighter-rouge">STRUCT</code>, Spark uses <code class="language-plaintext highlighter-rouge">to_json</code> and <code class="language-plaintext highlighter-rouge">from_json</code> to handle complex data types with <code class="language-plaintext highlighter-rouge">JSON</code> format. So
<code class="language-plaintext highlighter-rouge">COLLECTION ITEMS TERMINATED BY</code> and <code class="language-plaintext highlighter-rouge">MAP KEYS TERMINATED BY</code> won&#8217;t work in default row format.</li>
<li>The standard output of the user script is treated as tab-separated <code class="language-plaintext highlighter-rouge">STRING</code> columns. Any cell containing only a string <code class="language-plaintext highlighter-rouge">\N</code>
is re-interpreted as a literal <code class="language-plaintext highlighter-rouge">NULL</code> value, and then the resulting <code class="language-plaintext highlighter-rouge">STRING</code> column will be cast to the data types specified in <code class="language-plaintext highlighter-rouge">col_type</code>.</li>
<li>If the actual number of output columns is less than the number of specified output columns,
additional output columns will be filled with <code class="language-plaintext highlighter-rouge">NULL</code>. For example:
<div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code> output tabs: 1, 2
output columns: A: INT, B INT, C: INT
result:
+---+---+------+
| a| b| c|
+---+---+------+
| 1| 2| NULL|
+---+---+------+
</code></pre></div> </div>
</li>
<li>If the actual number of output columns is more than the number of specified output columns,
the output columns only select the corresponding columns, and the remaining part will be discarded.
For example, if the output has three tabs and there are only two output columns:
<div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code> output tabs: 1, 2, 3
output columns: A: INT, B INT
result:
+---+---+
| a| b|
+---+---+
| 1| 2|
+---+---+
</code></pre></div> </div>
</li>
<li>If there is no <code class="language-plaintext highlighter-rouge">AS</code> clause after <code class="language-plaintext highlighter-rouge">USING my_script</code>, the output schema is <code class="language-plaintext highlighter-rouge">key: STRING, value: STRING</code>.
The <code class="language-plaintext highlighter-rouge">key</code> column contains all the characters before the first tab and the <code class="language-plaintext highlighter-rouge">value</code> column contains the remaining characters after the first tab.
If there are no tabs, Spark returns the <code class="language-plaintext highlighter-rouge">NULL</code> value. For example:
<div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code> output tabs: 1, 2, 3
output columns:
result:
+-----+-------+
| key| value|
+-----+-------+
| 1| 2|
+-----+-------+
output tabs: 1, 2
output columns:
result:
+-----+-------+
| key| value|
+-----+-------+
| 1| NULL|
+-----+-------+
</code></pre></div> </div>
</li>
</ul>
<h3 id="hive-serde-behavior">Hive SerDe behavior</h3>
<p>When Hive support is enabled and Hive SerDe mode is used:</p>
<ul>
<li>Spark uses the Hive SerDe <code class="language-plaintext highlighter-rouge">org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe</code> by default, so columns are cast
to <code class="language-plaintext highlighter-rouge">STRING</code> and combined by tabs before feeding to the user script.</li>
<li>All literal <code class="language-plaintext highlighter-rouge">NULL</code> values are converted to a string <code class="language-plaintext highlighter-rouge">\N</code> in order to differentiate literal <code class="language-plaintext highlighter-rouge">NULL</code> values from the literal string <code class="language-plaintext highlighter-rouge">NULL</code>.</li>
<li>The standard output of the user script is treated as tab-separated <code class="language-plaintext highlighter-rouge">STRING</code> columns, any cell containing only a string <code class="language-plaintext highlighter-rouge">\N</code> is re-interpreted
as a <code class="language-plaintext highlighter-rouge">NULL</code> value, and then the resulting STRING column will be cast to the data type specified in <code class="language-plaintext highlighter-rouge">col_type</code>.</li>
<li>If the actual number of output columns is less than the number of specified output columns,
additional output columns will be filled with <code class="language-plaintext highlighter-rouge">NULL</code>.</li>
<li>If the actual number of output columns is more than the number of specified output columns,
the output columns only select the corresponding columns, and the remaining part will be discarded.</li>
<li>If there is no <code class="language-plaintext highlighter-rouge">AS</code> clause after <code class="language-plaintext highlighter-rouge">USING my_script</code>, the output schema is <code class="language-plaintext highlighter-rouge">key: STRING, value: STRING</code>.
The <code class="language-plaintext highlighter-rouge">key</code> column contains all the characters before the first tab and the <code class="language-plaintext highlighter-rouge">value</code> column contains the remaining characters after the first tab.
If there is no tab, Spark returns the <code class="language-plaintext highlighter-rouge">NULL</code> value.</li>
<li>These defaults can be overridden with <code class="language-plaintext highlighter-rouge">ROW FORMAT SERDE</code> or <code class="language-plaintext highlighter-rouge">ROW FORMAT DELIMITED</code>.</li>
</ul>
<h3 id="examples">Examples</h3>
<div class="language-sql highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="k">CREATE</span> <span class="k">TABLE</span> <span class="n">person</span> <span class="p">(</span><span class="n">zip_code</span> <span class="nb">INT</span><span class="p">,</span> <span class="n">name</span> <span class="n">STRING</span><span class="p">,</span> <span class="n">age</span> <span class="nb">INT</span><span class="p">);</span>
<span class="k">INSERT</span> <span class="k">INTO</span> <span class="n">person</span> <span class="k">VALUES</span>
<span class="p">(</span><span class="mi">94588</span><span class="p">,</span> <span class="s1">'Zen Hui'</span><span class="p">,</span> <span class="mi">50</span><span class="p">),</span>
<span class="p">(</span><span class="mi">94588</span><span class="p">,</span> <span class="s1">'Dan Li'</span><span class="p">,</span> <span class="mi">18</span><span class="p">),</span>
<span class="p">(</span><span class="mi">94588</span><span class="p">,</span> <span class="s1">'Anil K'</span><span class="p">,</span> <span class="mi">27</span><span class="p">),</span>
<span class="p">(</span><span class="mi">94588</span><span class="p">,</span> <span class="s1">'John V'</span><span class="p">,</span> <span class="k">NULL</span><span class="p">),</span>
<span class="p">(</span><span class="mi">94511</span><span class="p">,</span> <span class="s1">'David K'</span><span class="p">,</span> <span class="mi">42</span><span class="p">),</span>
<span class="p">(</span><span class="mi">94511</span><span class="p">,</span> <span class="s1">'Aryan B.'</span><span class="p">,</span> <span class="mi">18</span><span class="p">),</span>
<span class="p">(</span><span class="mi">94511</span><span class="p">,</span> <span class="s1">'Lalit B.'</span><span class="p">,</span> <span class="k">NULL</span><span class="p">);</span>
<span class="c1">-- With specified output without data type</span>
<span class="k">SELECT</span> <span class="k">TRANSFORM</span><span class="p">(</span><span class="n">zip_code</span><span class="p">,</span> <span class="n">name</span><span class="p">,</span> <span class="n">age</span><span class="p">)</span>
<span class="k">USING</span> <span class="s1">'cat'</span> <span class="k">AS</span> <span class="p">(</span><span class="n">a</span><span class="p">,</span> <span class="n">b</span><span class="p">,</span> <span class="k">c</span><span class="p">)</span>
<span class="k">FROM</span> <span class="n">person</span>
<span class="k">WHERE</span> <span class="n">zip_code</span> <span class="o">&gt;</span> <span class="mi">94511</span><span class="p">;</span>
<span class="o">+</span><span class="c1">-------+---------+-----+</span>
<span class="o">|</span> <span class="n">a</span> <span class="o">|</span> <span class="n">b</span><span class="o">|</span> <span class="k">c</span><span class="o">|</span>
<span class="o">+</span><span class="c1">-------+---------+-----+</span>
<span class="o">|</span> <span class="mi">94588</span><span class="o">|</span> <span class="n">Anil</span> <span class="n">K</span><span class="o">|</span> <span class="mi">27</span><span class="o">|</span>
<span class="o">|</span> <span class="mi">94588</span><span class="o">|</span> <span class="n">John</span> <span class="n">V</span><span class="o">|</span> <span class="k">NULL</span><span class="o">|</span>
<span class="o">|</span> <span class="mi">94588</span><span class="o">|</span> <span class="n">Zen</span> <span class="n">Hui</span><span class="o">|</span> <span class="mi">50</span><span class="o">|</span>
<span class="o">|</span> <span class="mi">94588</span><span class="o">|</span> <span class="n">Dan</span> <span class="n">Li</span><span class="o">|</span> <span class="mi">18</span><span class="o">|</span>
<span class="o">+</span><span class="c1">-------+---------+-----+</span>
<span class="c1">-- With specified output with data type</span>
<span class="k">SELECT</span> <span class="k">TRANSFORM</span><span class="p">(</span><span class="n">zip_code</span><span class="p">,</span> <span class="n">name</span><span class="p">,</span> <span class="n">age</span><span class="p">)</span>
<span class="k">USING</span> <span class="s1">'cat'</span> <span class="k">AS</span> <span class="p">(</span><span class="n">a</span> <span class="n">STRING</span><span class="p">,</span> <span class="n">b</span> <span class="n">STRING</span><span class="p">,</span> <span class="k">c</span> <span class="n">STRING</span><span class="p">)</span>
<span class="k">FROM</span> <span class="n">person</span>
<span class="k">WHERE</span> <span class="n">zip_code</span> <span class="o">&gt;</span> <span class="mi">94511</span><span class="p">;</span>
<span class="o">+</span><span class="c1">-------+---------+-----+</span>
<span class="o">|</span> <span class="n">a</span> <span class="o">|</span> <span class="n">b</span><span class="o">|</span> <span class="k">c</span><span class="o">|</span>
<span class="o">+</span><span class="c1">-------+---------+-----+</span>
<span class="o">|</span> <span class="mi">94588</span><span class="o">|</span> <span class="n">Anil</span> <span class="n">K</span><span class="o">|</span> <span class="mi">27</span><span class="o">|</span>
<span class="o">|</span> <span class="mi">94588</span><span class="o">|</span> <span class="n">John</span> <span class="n">V</span><span class="o">|</span> <span class="k">NULL</span><span class="o">|</span>
<span class="o">|</span> <span class="mi">94588</span><span class="o">|</span> <span class="n">Zen</span> <span class="n">Hui</span><span class="o">|</span> <span class="mi">50</span><span class="o">|</span>
<span class="o">|</span> <span class="mi">94588</span><span class="o">|</span> <span class="n">Dan</span> <span class="n">Li</span><span class="o">|</span> <span class="mi">18</span><span class="o">|</span>
<span class="o">+</span><span class="c1">-------+---------+-----+</span>
<span class="c1">-- Using ROW FORMAT DELIMITED</span>
<span class="k">SELECT</span> <span class="k">TRANSFORM</span><span class="p">(</span><span class="n">name</span><span class="p">,</span> <span class="n">age</span><span class="p">)</span>
<span class="k">ROW</span> <span class="n">FORMAT</span> <span class="n">DELIMITED</span>
<span class="n">FIELDS</span> <span class="n">TERMINATED</span> <span class="k">BY</span> <span class="s1">','</span>
<span class="n">LINES</span> <span class="n">TERMINATED</span> <span class="k">BY</span> <span class="s1">'</span><span class="se">\n</span><span class="s1">'</span>
<span class="k">NULL</span> <span class="k">DEFINED</span> <span class="k">AS</span> <span class="s1">'NULL'</span>
<span class="k">USING</span> <span class="s1">'cat'</span> <span class="k">AS</span> <span class="p">(</span><span class="n">name_age</span> <span class="n">string</span><span class="p">)</span>
<span class="k">ROW</span> <span class="n">FORMAT</span> <span class="n">DELIMITED</span>
<span class="n">FIELDS</span> <span class="n">TERMINATED</span> <span class="k">BY</span> <span class="s1">'@'</span>
<span class="n">LINES</span> <span class="n">TERMINATED</span> <span class="k">BY</span> <span class="s1">'</span><span class="se">\n</span><span class="s1">'</span>
<span class="k">NULL</span> <span class="k">DEFINED</span> <span class="k">AS</span> <span class="s1">'NULL'</span>
<span class="k">FROM</span> <span class="n">person</span><span class="p">;</span>
<span class="o">+</span><span class="c1">---------------+</span>
<span class="o">|</span> <span class="n">name_age</span><span class="o">|</span>
<span class="o">+</span><span class="c1">---------------+</span>
<span class="o">|</span> <span class="n">Anil</span> <span class="n">K</span><span class="p">,</span><span class="mi">27</span><span class="o">|</span>
<span class="o">|</span> <span class="n">John</span> <span class="n">V</span><span class="p">,</span><span class="k">null</span><span class="o">|</span>
<span class="o">|</span> <span class="n">ryan</span> <span class="n">B</span><span class="p">.,</span><span class="mi">18</span><span class="o">|</span>
<span class="o">|</span> <span class="n">David</span> <span class="n">K</span><span class="p">,</span><span class="mi">42</span><span class="o">|</span>
<span class="o">|</span> <span class="n">Zen</span> <span class="n">Hui</span><span class="p">,</span><span class="mi">50</span><span class="o">|</span>
<span class="o">|</span> <span class="n">Dan</span> <span class="n">Li</span><span class="p">,</span><span class="mi">18</span><span class="o">|</span>
<span class="o">|</span> <span class="n">Lalit</span> <span class="n">B</span><span class="p">.,</span><span class="k">null</span><span class="o">|</span>
<span class="o">+</span><span class="c1">---------------+</span>
<span class="c1">-- Using Hive Serde</span>
<span class="k">SELECT</span> <span class="k">TRANSFORM</span><span class="p">(</span><span class="n">zip_code</span><span class="p">,</span> <span class="n">name</span><span class="p">,</span> <span class="n">age</span><span class="p">)</span>
<span class="k">ROW</span> <span class="n">FORMAT</span> <span class="n">SERDE</span> <span class="s1">'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'</span>
<span class="k">WITH</span> <span class="n">SERDEPROPERTIES</span> <span class="p">(</span>
<span class="s1">'field.delim'</span> <span class="o">=</span> <span class="s1">'</span><span class="se">\t</span><span class="s1">'</span>
<span class="p">)</span>
<span class="k">USING</span> <span class="s1">'cat'</span> <span class="k">AS</span> <span class="p">(</span><span class="n">a</span> <span class="n">STRING</span><span class="p">,</span> <span class="n">b</span> <span class="n">STRING</span><span class="p">,</span> <span class="k">c</span> <span class="n">STRING</span><span class="p">)</span>
<span class="k">ROW</span> <span class="n">FORMAT</span> <span class="n">SERDE</span> <span class="s1">'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'</span>
<span class="k">WITH</span> <span class="n">SERDEPROPERTIES</span> <span class="p">(</span>
<span class="s1">'field.delim'</span> <span class="o">=</span> <span class="s1">'</span><span class="se">\t</span><span class="s1">'</span>
<span class="p">)</span>
<span class="k">FROM</span> <span class="n">person</span>
<span class="k">WHERE</span> <span class="n">zip_code</span> <span class="o">&gt;</span> <span class="mi">94511</span><span class="p">;</span>
<span class="o">+</span><span class="c1">-------+---------+-----+</span>
<span class="o">|</span> <span class="n">a</span> <span class="o">|</span> <span class="n">b</span><span class="o">|</span> <span class="k">c</span><span class="o">|</span>
<span class="o">+</span><span class="c1">-------+---------+-----+</span>
<span class="o">|</span> <span class="mi">94588</span><span class="o">|</span> <span class="n">Anil</span> <span class="n">K</span><span class="o">|</span> <span class="mi">27</span><span class="o">|</span>
<span class="o">|</span> <span class="mi">94588</span><span class="o">|</span> <span class="n">John</span> <span class="n">V</span><span class="o">|</span> <span class="k">NULL</span><span class="o">|</span>
<span class="o">|</span> <span class="mi">94588</span><span class="o">|</span> <span class="n">Zen</span> <span class="n">Hui</span><span class="o">|</span> <span class="mi">50</span><span class="o">|</span>
<span class="o">|</span> <span class="mi">94588</span><span class="o">|</span> <span class="n">Dan</span> <span class="n">Li</span><span class="o">|</span> <span class="mi">18</span><span class="o">|</span>
<span class="o">+</span><span class="c1">-------+---------+-----+</span>
<span class="c1">-- Schema-less mode</span>
<span class="k">SELECT</span> <span class="k">TRANSFORM</span><span class="p">(</span><span class="n">zip_code</span><span class="p">,</span> <span class="n">name</span><span class="p">,</span> <span class="n">age</span><span class="p">)</span>
<span class="k">USING</span> <span class="s1">'cat'</span>
<span class="k">FROM</span> <span class="n">person</span>
<span class="k">WHERE</span> <span class="n">zip_code</span> <span class="o">&gt;</span> <span class="mi">94500</span><span class="p">;</span>
<span class="o">+</span><span class="c1">-------+---------------------+</span>
<span class="o">|</span> <span class="k">key</span><span class="o">|</span> <span class="n">value</span><span class="o">|</span>
<span class="o">+</span><span class="c1">-------+---------------------+</span>
<span class="o">|</span> <span class="mi">94588</span><span class="o">|</span> <span class="n">Anil</span> <span class="n">K</span> <span class="mi">27</span><span class="o">|</span>
<span class="o">|</span> <span class="mi">94588</span><span class="o">|</span> <span class="n">John</span> <span class="n">V</span> <span class="err">\</span><span class="n">N</span><span class="o">|</span>
<span class="o">|</span> <span class="mi">94511</span><span class="o">|</span> <span class="n">Aryan</span> <span class="n">B</span><span class="p">.</span> <span class="mi">18</span><span class="o">|</span>
<span class="o">|</span> <span class="mi">94511</span><span class="o">|</span> <span class="n">David</span> <span class="n">K</span> <span class="mi">42</span><span class="o">|</span>
<span class="o">|</span> <span class="mi">94588</span><span class="o">|</span> <span class="n">Zen</span> <span class="n">Hui</span> <span class="mi">50</span><span class="o">|</span>
<span class="o">|</span> <span class="mi">94588</span><span class="o">|</span> <span class="n">Dan</span> <span class="n">Li</span> <span class="mi">18</span><span class="o">|</span>
<span class="o">|</span> <span class="mi">94511</span><span class="o">|</span> <span class="n">Lalit</span> <span class="n">B</span><span class="p">.</span> <span class="err">\</span><span class="n">N</span><span class="o">|</span>
<span class="o">+</span><span class="c1">-------+---------------------+</span>
</code></pre></div></div>
<h3 id="related-statements">Related Statements</h3>
<ul>
<li><a href="sql-ref-syntax-qry-select.html">SELECT Main</a></li>
<li><a href="sql-ref-syntax-qry-select-where.html">WHERE Clause</a></li>
<li><a href="sql-ref-syntax-qry-select-groupby.html">GROUP BY Clause</a></li>
<li><a href="sql-ref-syntax-qry-select-having.html">HAVING Clause</a></li>
<li><a href="sql-ref-syntax-qry-select-orderby.html">ORDER BY Clause</a></li>
<li><a href="sql-ref-syntax-qry-select-sortby.html">SORT BY Clause</a></li>
<li><a href="sql-ref-syntax-qry-select-distribute-by.html">DISTRIBUTE BY Clause</a></li>
<li><a href="sql-ref-syntax-qry-select-limit.html">LIMIT Clause</a></li>
<li><a href="sql-ref-syntax-qry-select-case.html">CASE Clause</a></li>
<li><a href="sql-ref-syntax-qry-select-pivot.html">PIVOT Clause</a></li>
<li><a href="sql-ref-syntax-qry-select-lateral-view.html">LATERAL VIEW Clause</a></li>
</ul>
</div>
<!-- /container -->
</div>
<script src="js/vendor/jquery-3.5.1.min.js"></script>
<script src="js/vendor/bootstrap.bundle.min.js"></script>
<script src="js/vendor/anchor.min.js"></script>
<script src="js/main.js"></script>
<script type="text/javascript" src="https://cdn.jsdelivr.net/npm/docsearch.js@2/dist/cdn/docsearch.min.js"></script>
<script type="text/javascript">
// DocSearch is entirely free and automated. DocSearch is built in two parts:
// 1. a crawler which we run on our own infrastructure every 24 hours. It follows every link
// in your website and extract content from every page it traverses. It then pushes this
// content to an Algolia index.
// 2. a JavaScript snippet to be inserted in your website that will bind this Algolia index
// to your search input and display its results in a dropdown UI. If you want to find more
// details on how works DocSearch, check the docs of DocSearch.
docsearch({
apiKey: 'd62f962a82bc9abb53471cb7b89da35e',
appId: 'RAI69RXRSK',
indexName: 'apache_spark',
inputSelector: '#docsearch-input',
enhancedSearchInput: true,
algoliaOptions: {
'facetFilters': ["version:3.3.0"]
},
debug: false // Set debug to true if you want to inspect the dropdown
});
</script>
<!-- MathJax Section -->
<script type="text/x-mathjax-config">
MathJax.Hub.Config({
TeX: { equationNumbers: { autoNumber: "AMS" } }
});
</script>
<script>
// Note that we load MathJax this way to work with local file (file://), HTTP and HTTPS.
// We could use "//cdn.mathjax...", but that won't support "file://".
(function(d, script) {
script = d.createElement('script');
script.type = 'text/javascript';
script.async = true;
script.onload = function(){
MathJax.Hub.Config({
tex2jax: {
inlineMath: [ ["$", "$"], ["\\\\(","\\\\)"] ],
displayMath: [ ["$$","$$"], ["\\[", "\\]"] ],
processEscapes: true,
skipTags: ['script', 'noscript', 'style', 'textarea', 'pre']
}
});
};
script.src = ('https:' == document.location.protocol ? 'https://' : 'http://') +
'cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.1/MathJax.js' +
'?config=TeX-AMS-MML_HTMLorMML';
d.getElementsByTagName('head')[0].appendChild(script);
}(document));
</script>
</body>
</html>