<!DOCTYPE html>
<!--[if lt IE 7]> <html class="no-js lt-ie9 lt-ie8 lt-ie7"> <![endif]-->
<!--[if IE 7]> <html class="no-js lt-ie9 lt-ie8"> <![endif]-->
<!--[if IE 8]> <html class="no-js lt-ie9"> <![endif]-->
<!--[if gt IE 8]><!--> <html class="no-js"> <!--<![endif]-->
<head>
<meta charset="utf-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1">
<title>Getting Started - Spark 3.2.0 Documentation</title>
<link rel="stylesheet" href="css/bootstrap.min.css">
<style>
body {
padding-top: 60px;
padding-bottom: 40px;
}
</style>
<meta name="viewport" content="width=device-width">
<link rel="stylesheet" href="css/main.css">
<script src="js/vendor/modernizr-2.6.1-respond-1.1.0.min.js"></script>
<link rel="stylesheet" href="css/pygments-default.css">
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/docsearch.js@2/dist/cdn/docsearch.min.css" />
<link rel="stylesheet" href="css/docsearch.css">
<!-- Google analytics script -->
<script type="text/javascript">
var _gaq = _gaq || [];
_gaq.push(['_setAccount', 'UA-32518208-2']);
_gaq.push(['_trackPageview']);
(function() {
var ga = document.createElement('script'); ga.type = 'text/javascript'; ga.async = true;
ga.src = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js';
var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s);
})();
</script>
</head>
<body>
<!--[if lt IE 7]>
<p class="chromeframe">You are using an outdated browser. <a href="https://browsehappy.com/">Upgrade your browser today</a> or <a href="http://www.google.com/chromeframe/?redirect=true">install Google Chrome Frame</a> to better experience this site.</p>
<![endif]-->
<!-- This code is taken from http://twitter.github.com/bootstrap/examples/hero.html -->
<nav class="navbar fixed-top navbar-expand-md navbar-light bg-light" id="topbar">
<div class="container">
<div class="navbar-header">
<div class="navbar-brand"><a href="index.html">
<img src="img/spark-logo-hd.png" style="height:50px;"/></a><span class="version">3.2.0</span>
</div>
</div>
<button class="navbar-toggler" type="button" data-toggle="collapse"
data-target="#navbarCollapse" aria-controls="navbarCollapse"
aria-expanded="false" aria-label="Toggle navigation">
<span class="navbar-toggler-icon"></span>
</button>
<div class="collapse navbar-collapse" id="navbarCollapse">
<ul class="navbar-nav">
<!--TODO(andyk): Add class="active" attribute to li somehow.-->
<li class="nav-item"><a href="index.html" class="nav-link">Overview</a></li>
<li class="nav-item dropdown">
<a href="#" class="nav-link dropdown-toggle" id="navbarQuickStart" role="button" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false">Programming Guides</a>
<div class="dropdown-menu" aria-labelledby="navbarQuickStart">
<a class="dropdown-item" href="quick-start.html">Quick Start</a>
<a class="dropdown-item" href="rdd-programming-guide.html">RDDs, Accumulators, Broadcasts Vars</a>
<a class="dropdown-item" href="sql-programming-guide.html">SQL, DataFrames, and Datasets</a>
<a class="dropdown-item" href="structured-streaming-programming-guide.html">Structured Streaming</a>
<a class="dropdown-item" href="streaming-programming-guide.html">Spark Streaming (DStreams)</a>
<a class="dropdown-item" href="ml-guide.html">MLlib (Machine Learning)</a>
<a class="dropdown-item" href="graphx-programming-guide.html">GraphX (Graph Processing)</a>
<a class="dropdown-item" href="sparkr.html">SparkR (R on Spark)</a>
<a class="dropdown-item" href="api/python/getting_started/index.html">PySpark (Python on Spark)</a>
</div>
</li>
<li class="nav-item dropdown">
<a href="#" class="nav-link dropdown-toggle" id="navbarAPIDocs" role="button" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false">API Docs</a>
<div class="dropdown-menu" aria-labelledby="navbarAPIDocs">
<a class="dropdown-item" href="api/scala/org/apache/spark/index.html">Scala</a>
<a class="dropdown-item" href="api/java/index.html">Java</a>
<a class="dropdown-item" href="api/python/index.html">Python</a>
<a class="dropdown-item" href="api/R/index.html">R</a>
<a class="dropdown-item" href="api/sql/index.html">SQL, Built-in Functions</a>
</div>
</li>
<li class="nav-item dropdown">
<a href="#" class="nav-link dropdown-toggle" id="navbarDeploying" role="button" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false">Deploying</a>
<div class="dropdown-menu" aria-labelledby="navbarDeploying">
<a class="dropdown-item" href="cluster-overview.html">Overview</a>
<a class="dropdown-item" href="submitting-applications.html">Submitting Applications</a>
<div class="dropdown-divider"></div>
<a class="dropdown-item" href="spark-standalone.html">Spark Standalone</a>
<a class="dropdown-item" href="running-on-mesos.html">Mesos</a>
<a class="dropdown-item" href="running-on-yarn.html">YARN</a>
<a class="dropdown-item" href="running-on-kubernetes.html">Kubernetes</a>
</div>
</li>
<li class="nav-item dropdown">
<a href="#" class="nav-link dropdown-toggle" id="navbarMore" role="button" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false">More</a>
<div class="dropdown-menu" aria-labelledby="navbarMore">
<a class="dropdown-item" href="configuration.html">Configuration</a>
<a class="dropdown-item" href="monitoring.html">Monitoring</a>
<a class="dropdown-item" href="tuning.html">Tuning Guide</a>
<a class="dropdown-item" href="job-scheduling.html">Job Scheduling</a>
<a class="dropdown-item" href="security.html">Security</a>
<a class="dropdown-item" href="hardware-provisioning.html">Hardware Provisioning</a>
<a class="dropdown-item" href="migration-guide.html">Migration Guide</a>
<div class="dropdown-divider"></div>
<a class="dropdown-item" href="building-spark.html">Building Spark</a>
<a class="dropdown-item" href="https://spark.apache.org/contributing.html">Contributing to Spark</a>
<a class="dropdown-item" href="https://spark.apache.org/third-party-projects.html">Third Party Projects</a>
</div>
</li>
<li class="nav-item">
<input type="text" id="docsearch-input" placeholder="Search the docs…">
</li>
</ul>
<!--<span class="navbar-text navbar-right"><span class="version-text">v3.2.0</span></span>-->
</div>
</div>
</nav>
<div class="container-wrapper">
<div class="left-menu-wrapper">
<div class="left-menu">
<h3><a href="sql-programming-guide.html">Spark SQL Guide</a></h3>
<ul>
<li>
<a href="sql-getting-started.html">
<b>Getting Started</b>
</a>
</li>
<ul>
<li>
<a href="sql-getting-started.html#starting-point-sparksession">
Starting Point: SparkSession
</a>
</li>
<li>
<a href="sql-getting-started.html#creating-dataframes">
Creating DataFrames
</a>
</li>
<li>
<a href="sql-getting-started.html#untyped-dataset-operations-aka-dataframe-operations">
Untyped Dataset Operations (aka DataFrame Operations)
</a>
</li>
<li>
<a href="sql-getting-started.html#running-sql-queries-programmatically">
Running SQL Queries Programmatically
</a>
</li>
<li>
<a href="sql-getting-started.html#global-temporary-view">
Global Temporary View
</a>
</li>
<li>
<a href="sql-getting-started.html#creating-datasets">
Creating Datasets
</a>
</li>
<li>
<a href="sql-getting-started.html#interoperating-with-rdds">
Interoperating with RDDs
</a>
</li>
<li>
<a href="sql-getting-started.html#scalar-functions">
Scalar Functions
</a>
</li>
<li>
<a href="sql-getting-started.html#aggregations">
Aggregations
</a>
</li>
</ul>
<li>
<a href="sql-data-sources.html">
Data Sources
</a>
</li>
<li>
<a href="sql-performance-tuning.html">
Performance Tuning
</a>
</li>
<li>
<a href="sql-distributed-sql-engine.html">
Distributed SQL Engine
</a>
</li>
<li>
<a href="sql-pyspark-pandas-with-arrow.html">
PySpark Usage Guide for Pandas with Apache Arrow
</a>
</li>
<li>
<a href="sql-migration-old.html">
Migration Guide
</a>
</li>
<li>
<a href="sql-ref.html">
SQL Reference
</a>
</li>
</ul>
</div>
</div>
<input id="nav-trigger" class="nav-trigger" checked type="checkbox">
<label for="nav-trigger"></label>
<div class="content-with-sidebar mr-3" id="content">
<h1 class="title">Getting Started</h1>
<ul id="markdown-toc">
<li><a href="#starting-point-sparksession" id="markdown-toc-starting-point-sparksession">Starting Point: SparkSession</a></li>
<li><a href="#creating-dataframes" id="markdown-toc-creating-dataframes">Creating DataFrames</a></li>
<li><a href="#untyped-dataset-operations-aka-dataframe-operations" id="markdown-toc-untyped-dataset-operations-aka-dataframe-operations">Untyped Dataset Operations (aka DataFrame Operations)</a></li>
<li><a href="#running-sql-queries-programmatically" id="markdown-toc-running-sql-queries-programmatically">Running SQL Queries Programmatically</a></li>
<li><a href="#global-temporary-view" id="markdown-toc-global-temporary-view">Global Temporary View</a></li>
<li><a href="#creating-datasets" id="markdown-toc-creating-datasets">Creating Datasets</a></li>
<li><a href="#interoperating-with-rdds" id="markdown-toc-interoperating-with-rdds">Interoperating with RDDs</a> <ul>
<li><a href="#inferring-the-schema-using-reflection" id="markdown-toc-inferring-the-schema-using-reflection">Inferring the Schema Using Reflection</a></li>
<li><a href="#programmatically-specifying-the-schema" id="markdown-toc-programmatically-specifying-the-schema">Programmatically Specifying the Schema</a></li>
</ul>
</li>
<li><a href="#scalar-functions" id="markdown-toc-scalar-functions">Scalar Functions</a></li>
<li><a href="#aggregate-functions" id="markdown-toc-aggregate-functions">Aggregate Functions</a></li>
</ul>
<h2 id="starting-point-sparksession">Starting Point: SparkSession</h2>
<div class="codetabs">
<div data-lang="scala">
<p>The entry point into all functionality in Spark is the <a href="api/scala/org/apache/spark/sql/SparkSession.html"><code class="language-plaintext highlighter-rouge">SparkSession</code></a> class. To create a basic <code class="language-plaintext highlighter-rouge">SparkSession</code>, just use <code class="language-plaintext highlighter-rouge">SparkSession.builder()</code>:</p>
<div class="highlight"><pre class="codehilite"><code><span class="k">import</span> <span class="nn">org.apache.spark.sql.SparkSession</span>
<span class="k">val</span> <span class="nv">spark</span> <span class="k">=</span> <span class="nc">SparkSession</span>
<span class="o">.</span><span class="py">builder</span><span class="o">()</span>
<span class="o">.</span><span class="py">appName</span><span class="o">(</span><span class="s">"Spark SQL basic example"</span><span class="o">)</span>
<span class="o">.</span><span class="py">config</span><span class="o">(</span><span class="s">"spark.some.config.option"</span><span class="o">,</span> <span class="s">"some-value"</span><span class="o">)</span>
<span class="o">.</span><span class="py">getOrCreate</span><span class="o">()</span></code></pre></div>
<div><small>Find full example code at "examples/src/main/scala/org/apache/spark/examples/sql/SparkSQLExample.scala" in the Spark repo.</small></div>
</div>
<div data-lang="java">
<p>The entry point into all functionality in Spark is the <a href="api/java/index.html#org.apache.spark.sql.SparkSession"><code class="language-plaintext highlighter-rouge">SparkSession</code></a> class. To create a basic <code class="language-plaintext highlighter-rouge">SparkSession</code>, just use <code class="language-plaintext highlighter-rouge">SparkSession.builder()</code>:</p>
<div class="highlight"><pre class="codehilite"><code><span class="kn">import</span> <span class="nn">org.apache.spark.sql.SparkSession</span><span class="o">;</span>
<span class="nc">SparkSession</span> <span class="n">spark</span> <span class="o">=</span> <span class="nc">SparkSession</span>
<span class="o">.</span><span class="na">builder</span><span class="o">()</span>
<span class="o">.</span><span class="na">appName</span><span class="o">(</span><span class="s">"Java Spark SQL basic example"</span><span class="o">)</span>
<span class="o">.</span><span class="na">config</span><span class="o">(</span><span class="s">"spark.some.config.option"</span><span class="o">,</span> <span class="s">"some-value"</span><span class="o">)</span>
<span class="o">.</span><span class="na">getOrCreate</span><span class="o">();</span></code></pre></div>
<div><small>Find full example code at "examples/src/main/java/org/apache/spark/examples/sql/JavaSparkSQLExample.java" in the Spark repo.</small></div>
</div>
<div data-lang="python">
<p>The entry point into all functionality in Spark is the <a href="api/python/reference/api/pyspark.sql.SparkSession.html"><code class="language-plaintext highlighter-rouge">SparkSession</code></a> class. To create a basic <code class="language-plaintext highlighter-rouge">SparkSession</code>, just use <code class="language-plaintext highlighter-rouge">SparkSession.builder</code>:</p>
<div class="highlight"><pre class="codehilite"><code><span class="kn">from</span> <span class="nn">pyspark.sql</span> <span class="kn">import</span> <span class="n">SparkSession</span>
<span class="n">spark</span> <span class="o">=</span> <span class="n">SparkSession</span> \
<span class="p">.</span><span class="n">builder</span> \
<span class="p">.</span><span class="n">appName</span><span class="p">(</span><span class="s">"Python Spark SQL basic example"</span><span class="p">)</span> \
<span class="p">.</span><span class="n">config</span><span class="p">(</span><span class="s">"spark.some.config.option"</span><span class="p">,</span> <span class="s">"some-value"</span><span class="p">)</span> \
<span class="p">.</span><span class="n">getOrCreate</span><span class="p">()</span></code></pre></div>
<div><small>Find full example code at "examples/src/main/python/sql/basic.py" in the Spark repo.</small></div>
</div>
<div data-lang="r">
<p>The entry point into all functionality in Spark is the <a href="api/R/sparkR.session.html"><code class="language-plaintext highlighter-rouge">SparkSession</code></a> class. To initialize a basic <code class="language-plaintext highlighter-rouge">SparkSession</code>, just call <code class="language-plaintext highlighter-rouge">sparkR.session()</code>:</p>
<div class="highlight"><pre class="codehilite"><code><span class="n">sparkR.session</span><span class="p">(</span><span class="n">appName</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="s2">"R Spark SQL basic example"</span><span class="p">,</span><span class="w"> </span><span class="n">sparkConfig</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="nf">list</span><span class="p">(</span><span class="n">spark.some.config.option</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="s2">"some-value"</span><span class="p">))</span></code></pre></div>
<div><small>Find full example code at "examples/src/main/r/RSparkSQLExample.R" in the Spark repo.</small></div>
<p>Note that when invoked for the first time, <code class="language-plaintext highlighter-rouge">sparkR.session()</code> initializes a global <code class="language-plaintext highlighter-rouge">SparkSession</code> singleton instance and returns a reference to this same instance on successive invocations. In this way, users only need to initialize the <code class="language-plaintext highlighter-rouge">SparkSession</code> once; after that, SparkR functions like <code class="language-plaintext highlighter-rouge">read.df</code> can access the global instance implicitly, without the <code class="language-plaintext highlighter-rouge">SparkSession</code> instance being passed around.</p>
</div>
</div>
<p><code class="language-plaintext highlighter-rouge">SparkSession</code> in Spark 2.0 provides builtin support for Hive features including the ability to
write queries using HiveQL, access to Hive UDFs, and the ability to read data from Hive tables.
To use these features, you do not need to have an existing Hive setup.</p>
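<p>As a minimal sketch (in Python), Hive support is enabled through the builder&#8217;s <code class="language-plaintext highlighter-rouge">enableHiveSupport()</code> option; the app name and warehouse path below are illustrative assumptions, not required values:</p>
<div class="highlight"><pre class="codehilite"><code># A minimal sketch: build a SparkSession with Hive support enabled.
# enableHiveSupport() turns on HiveQL, Hive UDFs, and Hive table access.
# The warehouse directory below is a hypothetical example path.
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .appName("Hive support example") \
    .config("spark.sql.warehouse.dir", "/tmp/spark-warehouse") \
    .enableHiveSupport() \
    .getOrCreate()</code></pre></div>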
<h2 id="creating-dataframes">Creating DataFrames</h2>
<div class="codetabs">
<div data-lang="scala">
<p>With a <code class="language-plaintext highlighter-rouge">SparkSession</code>, applications can create DataFrames from an <a href="#interoperating-with-rdds">existing <code class="language-plaintext highlighter-rouge">RDD</code></a>,
from a Hive table, or from <a href="sql-data-sources.html">Spark data sources</a>.</p>
<p>As an example, the following creates a DataFrame based on the content of a JSON file:</p>
<div class="highlight"><pre class="codehilite"><code><span class="k">val</span> <span class="nv">df</span> <span class="k">=</span> <span class="nv">spark</span><span class="o">.</span><span class="py">read</span><span class="o">.</span><span class="py">json</span><span class="o">(</span><span class="s">"examples/src/main/resources/people.json"</span><span class="o">)</span>
<span class="c1">// Displays the content of the DataFrame to stdout</span>
<span class="nv">df</span><span class="o">.</span><span class="py">show</span><span class="o">()</span>
<span class="c1">// +----+-------+</span>
<span class="c1">// | age| name|</span>
<span class="c1">// +----+-------+</span>
<span class="c1">// |null|Michael|</span>
<span class="c1">// | 30| Andy|</span>
<span class="c1">// | 19| Justin|</span>
<span class="c1">// +----+-------+</span></code></pre></div>
<div><small>Find full example code at "examples/src/main/scala/org/apache/spark/examples/sql/SparkSQLExample.scala" in the Spark repo.</small></div>
</div>
<div data-lang="java">
<p>With a <code class="language-plaintext highlighter-rouge">SparkSession</code>, applications can create DataFrames from an <a href="#interoperating-with-rdds">existing <code class="language-plaintext highlighter-rouge">RDD</code></a>,
from a Hive table, or from <a href="sql-data-sources.html">Spark data sources</a>.</p>
<p>As an example, the following creates a DataFrame based on the content of a JSON file:</p>
<div class="highlight"><pre class="codehilite"><code><span class="kn">import</span> <span class="nn">org.apache.spark.sql.Dataset</span><span class="o">;</span>
<span class="kn">import</span> <span class="nn">org.apache.spark.sql.Row</span><span class="o">;</span>
<span class="nc">Dataset</span><span class="o">&lt;</span><span class="nc">Row</span><span class="o">&gt;</span> <span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="na">read</span><span class="o">().</span><span class="na">json</span><span class="o">(</span><span class="s">"examples/src/main/resources/people.json"</span><span class="o">);</span>
<span class="c1">// Displays the content of the DataFrame to stdout</span>
<span class="n">df</span><span class="o">.</span><span class="na">show</span><span class="o">();</span>
<span class="c1">// +----+-------+</span>
<span class="c1">// | age| name|</span>
<span class="c1">// +----+-------+</span>
<span class="c1">// |null|Michael|</span>
<span class="c1">// | 30| Andy|</span>
<span class="c1">// | 19| Justin|</span>
<span class="c1">// +----+-------+</span></code></pre></div>
<div><small>Find full example code at "examples/src/main/java/org/apache/spark/examples/sql/JavaSparkSQLExample.java" in the Spark repo.</small></div>
</div>
<div data-lang="python">
<p>With a <code class="language-plaintext highlighter-rouge">SparkSession</code>, applications can create DataFrames from an <a href="#interoperating-with-rdds">existing <code class="language-plaintext highlighter-rouge">RDD</code></a>,
from a Hive table, or from <a href="sql-data-sources.html">Spark data sources</a>.</p>
<p>As an example, the following creates a DataFrame based on the content of a JSON file:</p>
<div class="highlight"><pre class="codehilite"><code><span class="c1"># spark is an existing SparkSession
</span><span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="p">.</span><span class="n">read</span><span class="p">.</span><span class="n">json</span><span class="p">(</span><span class="s">"examples/src/main/resources/people.json"</span><span class="p">)</span>
<span class="c1"># Displays the content of the DataFrame to stdout
</span><span class="n">df</span><span class="p">.</span><span class="n">show</span><span class="p">()</span>
<span class="c1"># +----+-------+
# | age| name|
# +----+-------+
# |null|Michael|
# | 30| Andy|
# | 19| Justin|
# +----+-------+</span></code></pre></div>
<div><small>Find full example code at "examples/src/main/python/sql/basic.py" in the Spark repo.</small></div>
</div>
<div data-lang="r">
<p>With a <code class="language-plaintext highlighter-rouge">SparkSession</code>, applications can create DataFrames from a local R data.frame,
from a Hive table, or from <a href="sql-data-sources.html">Spark data sources</a>.</p>
<p>As an example, the following creates a DataFrame based on the content of a JSON file:</p>
<div class="highlight"><pre class="codehilite"><code><span class="n">df</span><span class="w"> </span><span class="o">&lt;-</span><span class="w"> </span><span class="n">read.json</span><span class="p">(</span><span class="s2">"examples/src/main/resources/people.json"</span><span class="p">)</span><span class="w">
</span><span class="c1"># Displays the content of the DataFrame</span><span class="w">
</span><span class="n">head</span><span class="p">(</span><span class="n">df</span><span class="p">)</span><span class="w">
</span><span class="c1">## age name</span><span class="w">
</span><span class="c1">## 1 NA Michael</span><span class="w">
</span><span class="c1">## 2 30 Andy</span><span class="w">
</span><span class="c1">## 3 19 Justin</span><span class="w">
</span><span class="c1"># Another method to print the first few rows and optionally truncate the printing of long values</span><span class="w">
</span><span class="n">showDF</span><span class="p">(</span><span class="n">df</span><span class="p">)</span><span class="w">
</span><span class="c1">## +----+-------+</span><span class="w">
</span><span class="c1">## | age| name|</span><span class="w">
</span><span class="c1">## +----+-------+</span><span class="w">
</span><span class="c1">## |null|Michael|</span><span class="w">
</span><span class="c1">## | 30| Andy|</span><span class="w">
</span><span class="c1">## | 19| Justin|</span><span class="w">
</span><span class="c1">## +----+-------+</span></code></pre></div>
<div><small>Find full example code at "examples/src/main/r/RSparkSQLExample.R" in the Spark repo.</small></div>
</div>
</div>
<h2 id="untyped-dataset-operations-aka-dataframe-operations">Untyped Dataset Operations (aka DataFrame Operations)</h2>
<p>DataFrames provide a domain-specific language for structured data manipulation in <a href="api/scala/org/apache/spark/sql/Dataset.html">Scala</a>, <a href="api/java/index.html?org/apache/spark/sql/Dataset.html">Java</a>, <a href="api/python/reference/api/pyspark.sql.DataFrame.html">Python</a> and <a href="api/R/SparkDataFrame.html">R</a>.</p>
<p>As mentioned above, in Spark 2.0, DataFrames are just Datasets of <code class="language-plaintext highlighter-rouge">Row</code>s in the Scala and Java APIs. These operations are also referred to as &#8220;untyped transformations&#8221;, in contrast to the &#8220;typed transformations&#8221; that come with strongly typed Scala/Java Datasets.</p>
<p>Here we include some basic examples of structured data processing using Datasets:</p>
<div class="codetabs">
<div data-lang="scala">
<div class="highlight"><pre class="codehilite"><code><span class="c1">// This import is needed to use the $-notation</span>
<span class="k">import</span> <span class="nn">spark.implicits._</span>
<span class="c1">// Print the schema in a tree format</span>
<span class="nv">df</span><span class="o">.</span><span class="py">printSchema</span><span class="o">()</span>
<span class="c1">// root</span>
<span class="c1">// |-- age: long (nullable = true)</span>
<span class="c1">// |-- name: string (nullable = true)</span>
<span class="c1">// Select only the "name" column</span>
<span class="nv">df</span><span class="o">.</span><span class="py">select</span><span class="o">(</span><span class="s">"name"</span><span class="o">).</span><span class="py">show</span><span class="o">()</span>
<span class="c1">// +-------+</span>
<span class="c1">// | name|</span>
<span class="c1">// +-------+</span>
<span class="c1">// |Michael|</span>
<span class="c1">// | Andy|</span>
<span class="c1">// | Justin|</span>
<span class="c1">// +-------+</span>
<span class="c1">// Select everybody, but increment the age by 1</span>
<span class="nv">df</span><span class="o">.</span><span class="py">select</span><span class="o">(</span><span class="n">$</span><span class="s">"name"</span><span class="o">,</span> <span class="n">$</span><span class="s">"age"</span> <span class="o">+</span> <span class="mi">1</span><span class="o">).</span><span class="py">show</span><span class="o">()</span>
<span class="c1">// +-------+---------+</span>
<span class="c1">// | name|(age + 1)|</span>
<span class="c1">// +-------+---------+</span>
<span class="c1">// |Michael| null|</span>
<span class="c1">// | Andy| 31|</span>
<span class="c1">// | Justin| 20|</span>
<span class="c1">// +-------+---------+</span>
<span class="c1">// Select people older than 21</span>
<span class="nv">df</span><span class="o">.</span><span class="py">filter</span><span class="o">(</span><span class="n">$</span><span class="s">"age"</span> <span class="o">&gt;</span> <span class="mi">21</span><span class="o">).</span><span class="py">show</span><span class="o">()</span>
<span class="c1">// +---+----+</span>
<span class="c1">// |age|name|</span>
<span class="c1">// +---+----+</span>
<span class="c1">// | 30|Andy|</span>
<span class="c1">// +---+----+</span>
<span class="c1">// Count people by age</span>
<span class="nv">df</span><span class="o">.</span><span class="py">groupBy</span><span class="o">(</span><span class="s">"age"</span><span class="o">).</span><span class="py">count</span><span class="o">().</span><span class="py">show</span><span class="o">()</span>
<span class="c1">// +----+-----+</span>
<span class="c1">// | age|count|</span>
<span class="c1">// +----+-----+</span>
<span class="c1">// | 19| 1|</span>
<span class="c1">// |null| 1|</span>
<span class="c1">// | 30| 1|</span>
<span class="c1">// +----+-----+</span></code></pre></div>
<div><small>Find full example code at "examples/src/main/scala/org/apache/spark/examples/sql/SparkSQLExample.scala" in the Spark repo.</small></div>
<p>For a complete list of the types of operations that can be performed on a Dataset, refer to the <a href="api/scala/org/apache/spark/sql/Dataset.html">API Documentation</a>.</p>
<p>In addition to simple column references and expressions, Datasets also have a rich library of functions including string manipulation, date arithmetic, common math operations and more. The complete list is available in the <a href="api/scala/org/apache/spark/sql/functions$.html">DataFrame Function Reference</a>.</p>
</div>
<div data-lang="java">
<div class="highlight"><pre class="codehilite"><code><span class="c1">// col("...") is preferable to df.col("...")</span>
<span class="kn">import</span> <span class="nn">static</span> <span class="n">org</span><span class="o">.</span><span class="na">apache</span><span class="o">.</span><span class="na">spark</span><span class="o">.</span><span class="na">sql</span><span class="o">.</span><span class="na">functions</span><span class="o">.</span><span class="na">col</span><span class="o">;</span>
<span class="c1">// Print the schema in a tree format</span>
<span class="n">df</span><span class="o">.</span><span class="na">printSchema</span><span class="o">();</span>
<span class="c1">// root</span>
<span class="c1">// |-- age: long (nullable = true)</span>
<span class="c1">// |-- name: string (nullable = true)</span>
<span class="c1">// Select only the "name" column</span>
<span class="n">df</span><span class="o">.</span><span class="na">select</span><span class="o">(</span><span class="s">"name"</span><span class="o">).</span><span class="na">show</span><span class="o">();</span>
<span class="c1">// +-------+</span>
<span class="c1">// | name|</span>
<span class="c1">// +-------+</span>
<span class="c1">// |Michael|</span>
<span class="c1">// | Andy|</span>
<span class="c1">// | Justin|</span>
<span class="c1">// +-------+</span>
<span class="c1">// Select everybody, but increment the age by 1</span>
<span class="n">df</span><span class="o">.</span><span class="na">select</span><span class="o">(</span><span class="n">col</span><span class="o">(</span><span class="s">"name"</span><span class="o">),</span> <span class="n">col</span><span class="o">(</span><span class="s">"age"</span><span class="o">).</span><span class="na">plus</span><span class="o">(</span><span class="mi">1</span><span class="o">)).</span><span class="na">show</span><span class="o">();</span>
<span class="c1">// +-------+---------+</span>
<span class="c1">// | name|(age + 1)|</span>
<span class="c1">// +-------+---------+</span>
<span class="c1">// |Michael| null|</span>
<span class="c1">// | Andy| 31|</span>
<span class="c1">// | Justin| 20|</span>
<span class="c1">// +-------+---------+</span>
<span class="c1">// Select people older than 21</span>
<span class="n">df</span><span class="o">.</span><span class="na">filter</span><span class="o">(</span><span class="n">col</span><span class="o">(</span><span class="s">"age"</span><span class="o">).</span><span class="na">gt</span><span class="o">(</span><span class="mi">21</span><span class="o">)).</span><span class="na">show</span><span class="o">();</span>
<span class="c1">// +---+----+</span>
<span class="c1">// |age|name|</span>
<span class="c1">// +---+----+</span>
<span class="c1">// | 30|Andy|</span>
<span class="c1">// +---+----+</span>
<span class="c1">// Count people by age</span>
<span class="n">df</span><span class="o">.</span><span class="na">groupBy</span><span class="o">(</span><span class="s">"age"</span><span class="o">).</span><span class="na">count</span><span class="o">().</span><span class="na">show</span><span class="o">();</span>
<span class="c1">// +----+-----+</span>
<span class="c1">// | age|count|</span>
<span class="c1">// +----+-----+</span>
<span class="c1">// | 19| 1|</span>
<span class="c1">// |null| 1|</span>
<span class="c1">// | 30| 1|</span>
<span class="c1">// +----+-----+</span></code></pre></div>
<div><small>Find full example code at "examples/src/main/java/org/apache/spark/examples/sql/JavaSparkSQLExample.java" in the Spark repo.</small></div>
<p>For a complete list of the types of operations that can be performed on a Dataset, refer to the <a href="api/java/org/apache/spark/sql/Dataset.html">API Documentation</a>.</p>
<p>In addition to simple column references and expressions, Datasets also have a rich library of functions including string manipulation, date arithmetic, common math operations and more. The complete list is available in the <a href="api/java/org/apache/spark/sql/functions.html">DataFrame Function Reference</a>.</p>
</div>
<div data-lang="python">
<p>In Python, it&#8217;s possible to access a DataFrame&#8217;s columns either by attribute
(<code class="language-plaintext highlighter-rouge">df.age</code>) or by indexing (<code class="language-plaintext highlighter-rouge">df['age']</code>). While the former is convenient for
interactive data exploration, users are highly encouraged to use the
latter form, which is future-proof and won&#8217;t break with column names that
are also attributes on the DataFrame class.</p>
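<p>To make the pitfall concrete, here is a brief sketch; the column name <code class="language-plaintext highlighter-rouge">count</code> is a hypothetical choice, picked because it collides with the <code class="language-plaintext highlighter-rouge">DataFrame.count</code> method:</p>
<div class="highlight"><pre class="codehilite"><code># Sketch: a hypothetical DataFrame with a column named "count".
# Attribute access resolves to the DataFrame.count method, not the column,
# so only the indexing form reliably refers to the column.
df2 = spark.createDataFrame([(1, 10), (2, 20)], ["id", "count"])
df2.select(df2['count']).show()   # selects the "count" column
# df2.select(df2.count)           # passes the bound method and raises an error</code></pre></div>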
<div class="highlight"><pre class="codehilite"><code><span class="c1"># spark, df are from the previous example
# Print the schema in a tree format
</span><span class="n">df</span><span class="p">.</span><span class="n">printSchema</span><span class="p">()</span>
<span class="c1"># root
# |-- age: long (nullable = true)
# |-- name: string (nullable = true)
</span>
<span class="c1"># Select only the "name" column
</span><span class="n">df</span><span class="p">.</span><span class="n">select</span><span class="p">(</span><span class="s">"name"</span><span class="p">).</span><span class="n">show</span><span class="p">()</span>
<span class="c1"># +-------+
# | name|
# +-------+
# |Michael|
# | Andy|
# | Justin|
# +-------+
</span>
<span class="c1"># Select everybody, but increment the age by 1
</span><span class="n">df</span><span class="p">.</span><span class="n">select</span><span class="p">(</span><span class="n">df</span><span class="p">[</span><span class="s">'name'</span><span class="p">],</span> <span class="n">df</span><span class="p">[</span><span class="s">'age'</span><span class="p">]</span> <span class="o">+</span> <span class="mi">1</span><span class="p">).</span><span class="n">show</span><span class="p">()</span>
<span class="c1"># +-------+---------+
# | name|(age + 1)|
# +-------+---------+
# |Michael| null|
# | Andy| 31|
# | Justin| 20|
# +-------+---------+
</span>
<span class="c1"># Select people older than 21
</span><span class="n">df</span><span class="p">.</span><span class="nb">filter</span><span class="p">(</span><span class="n">df</span><span class="p">[</span><span class="s">'age'</span><span class="p">]</span> <span class="o">&gt;</span> <span class="mi">21</span><span class="p">).</span><span class="n">show</span><span class="p">()</span>
<span class="c1"># +---+----+
# |age|name|
# +---+----+
# | 30|Andy|
# +---+----+
</span>
<span class="c1"># Count people by age
</span><span class="n">df</span><span class="p">.</span><span class="n">groupBy</span><span class="p">(</span><span class="s">"age"</span><span class="p">).</span><span class="n">count</span><span class="p">().</span><span class="n">show</span><span class="p">()</span>
<span class="c1"># +----+-----+
# | age|count|
# +----+-----+
# | 19| 1|
# |null| 1|
# | 30| 1|
# +----+-----+</span></code></pre></div>
<div><small>Find full example code at "examples/src/main/python/sql/basic.py" in the Spark repo.</small></div>
<p>For a complete list of the types of operations that can be performed on a DataFrame, refer to the <a href="api/python/reference/pyspark.sql.html#dataframe-apis">API Documentation</a>.</p>
<p>In addition to simple column references and expressions, DataFrames also have a rich library of functions including string manipulation, date arithmetic, common math operations and more. The complete list is available in the <a href="api/python/reference/pyspark.sql.html#functions">DataFrame Function Reference</a>.</p>
</div>
<div data-lang="r">
<div class="highlight"><pre class="codehilite"><code><span class="c1"># Create the DataFrame</span><span class="w">
</span><span class="n">df</span><span class="w"> </span><span class="o">&lt;-</span><span class="w"> </span><span class="n">read.json</span><span class="p">(</span><span class="s2">"examples/src/main/resources/people.json"</span><span class="p">)</span><span class="w">
</span><span class="c1"># Show the content of the DataFrame</span><span class="w">
</span><span class="n">head</span><span class="p">(</span><span class="n">df</span><span class="p">)</span><span class="w">
</span><span class="c1">## age name</span><span class="w">
</span><span class="c1">## 1 NA Michael</span><span class="w">
</span><span class="c1">## 2 30 Andy</span><span class="w">
</span><span class="c1">## 3 19 Justin</span><span class="w">
</span><span class="c1"># Print the schema in a tree format</span><span class="w">
</span><span class="n">printSchema</span><span class="p">(</span><span class="n">df</span><span class="p">)</span><span class="w">
</span><span class="c1">## root</span><span class="w">
</span><span class="c1">## |-- age: long (nullable = true)</span><span class="w">
</span><span class="c1">## |-- name: string (nullable = true)</span><span class="w">
</span><span class="c1"># Select only the "name" column</span><span class="w">
</span><span class="n">head</span><span class="p">(</span><span class="n">select</span><span class="p">(</span><span class="n">df</span><span class="p">,</span><span class="w"> </span><span class="s2">"name"</span><span class="p">))</span><span class="w">
</span><span class="c1">## name</span><span class="w">
</span><span class="c1">## 1 Michael</span><span class="w">
</span><span class="c1">## 2 Andy</span><span class="w">
</span><span class="c1">## 3 Justin</span><span class="w">
</span><span class="c1"># Select everybody, but increment the age by 1</span><span class="w">
</span><span class="n">head</span><span class="p">(</span><span class="n">select</span><span class="p">(</span><span class="n">df</span><span class="p">,</span><span class="w"> </span><span class="n">df</span><span class="o">$</span><span class="n">name</span><span class="p">,</span><span class="w"> </span><span class="n">df</span><span class="o">$</span><span class="n">age</span><span class="w"> </span><span class="o">+</span><span class="w"> </span><span class="m">1</span><span class="p">))</span><span class="w">
</span><span class="c1">## name (age + 1.0)</span><span class="w">
</span><span class="c1">## 1 Michael NA</span><span class="w">
</span><span class="c1">## 2 Andy 31</span><span class="w">
</span><span class="c1">## 3 Justin 20</span><span class="w">
</span><span class="c1"># Select people older than 21</span><span class="w">
</span><span class="n">head</span><span class="p">(</span><span class="n">where</span><span class="p">(</span><span class="n">df</span><span class="p">,</span><span class="w"> </span><span class="n">df</span><span class="o">$</span><span class="n">age</span><span class="w"> </span><span class="o">&gt;</span><span class="w"> </span><span class="m">21</span><span class="p">))</span><span class="w">
</span><span class="c1">## age name</span><span class="w">
</span><span class="c1">## 1 30 Andy</span><span class="w">
</span><span class="c1"># Count people by age</span><span class="w">
</span><span class="n">head</span><span class="p">(</span><span class="n">count</span><span class="p">(</span><span class="n">groupBy</span><span class="p">(</span><span class="n">df</span><span class="p">,</span><span class="w"> </span><span class="s2">"age"</span><span class="p">)))</span><span class="w">
</span><span class="c1">## age count</span><span class="w">
</span><span class="c1">## 1 19 1</span><span class="w">
</span><span class="c1">## 2 NA 1</span><span class="w">
</span><span class="c1">## 3 30 1</span></code></pre></div>
<div><small>Find full example code at "examples/src/main/r/RSparkSQLExample.R" in the Spark repo.</small></div>
<p>For a complete list of the types of operations that can be performed on a DataFrame, refer to the <a href="api/R/index.html">API Documentation</a>.</p>
<p>In addition to simple column references and expressions, DataFrames also have a rich library of functions including string manipulation, date arithmetic, common math operations and more. The complete list is available in the <a href="api/R/SparkDataFrame.html">DataFrame Function Reference</a>.</p>
</div>
</div>
<h2 id="running-sql-queries-programmatically">Running SQL Queries Programmatically</h2>
<div class="codetabs">
<div data-lang="scala">
<p>The <code class="language-plaintext highlighter-rouge">sql</code> function on a <code class="language-plaintext highlighter-rouge">SparkSession</code> enables applications to run SQL queries programmatically and returns the result as a <code class="language-plaintext highlighter-rouge">DataFrame</code>.</p>
<div class="highlight"><pre class="codehilite"><code><span class="c1">// Register the DataFrame as a SQL temporary view</span>
<span class="nv">df</span><span class="o">.</span><span class="py">createOrReplaceTempView</span><span class="o">(</span><span class="s">"people"</span><span class="o">)</span>
<span class="k">val</span> <span class="nv">sqlDF</span> <span class="k">=</span> <span class="nv">spark</span><span class="o">.</span><span class="py">sql</span><span class="o">(</span><span class="s">"SELECT * FROM people"</span><span class="o">)</span>
<span class="nv">sqlDF</span><span class="o">.</span><span class="py">show</span><span class="o">()</span>
<span class="c1">// +----+-------+</span>
<span class="c1">// | age| name|</span>
<span class="c1">// +----+-------+</span>
<span class="c1">// |null|Michael|</span>
<span class="c1">// | 30| Andy|</span>
<span class="c1">// | 19| Justin|</span>
<span class="c1">// +----+-------+</span></code></pre></div>
<div><small>Find full example code at "examples/src/main/scala/org/apache/spark/examples/sql/SparkSQLExample.scala" in the Spark repo.</small></div>
</div>
<div data-lang="java">
<p>The <code class="language-plaintext highlighter-rouge">sql</code> function on a <code class="language-plaintext highlighter-rouge">SparkSession</code> enables applications to run SQL queries programmatically and returns the result as a <code class="language-plaintext highlighter-rouge">Dataset&lt;Row&gt;</code>.</p>
<div class="highlight"><pre class="codehilite"><code><span class="kn">import</span> <span class="nn">org.apache.spark.sql.Dataset</span><span class="o">;</span>
<span class="kn">import</span> <span class="nn">org.apache.spark.sql.Row</span><span class="o">;</span>
<span class="c1">// Register the DataFrame as a SQL temporary view</span>
<span class="n">df</span><span class="o">.</span><span class="na">createOrReplaceTempView</span><span class="o">(</span><span class="s">"people"</span><span class="o">);</span>
<span class="nc">Dataset</span><span class="o">&lt;</span><span class="nc">Row</span><span class="o">&gt;</span> <span class="n">sqlDF</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="na">sql</span><span class="o">(</span><span class="s">"SELECT * FROM people"</span><span class="o">);</span>
<span class="n">sqlDF</span><span class="o">.</span><span class="na">show</span><span class="o">();</span>
<span class="c1">// +----+-------+</span>
<span class="c1">// | age| name|</span>
<span class="c1">// +----+-------+</span>
<span class="c1">// |null|Michael|</span>
<span class="c1">// | 30| Andy|</span>
<span class="c1">// | 19| Justin|</span>
<span class="c1">// +----+-------+</span></code></pre></div>
<div><small>Find full example code at "examples/src/main/java/org/apache/spark/examples/sql/JavaSparkSQLExample.java" in the Spark repo.</small></div>
</div>
<div data-lang="python">
<p>The <code class="language-plaintext highlighter-rouge">sql</code> function on a <code class="language-plaintext highlighter-rouge">SparkSession</code> enables applications to run SQL queries programmatically and returns the result as a <code class="language-plaintext highlighter-rouge">DataFrame</code>.</p>
<div class="highlight"><pre class="codehilite"><code><span class="c1"># Register the DataFrame as a SQL temporary view
</span><span class="n">df</span><span class="p">.</span><span class="n">createOrReplaceTempView</span><span class="p">(</span><span class="s">"people"</span><span class="p">)</span>
<span class="n">sqlDF</span> <span class="o">=</span> <span class="n">spark</span><span class="p">.</span><span class="n">sql</span><span class="p">(</span><span class="s">"SELECT * FROM people"</span><span class="p">)</span>
<span class="n">sqlDF</span><span class="p">.</span><span class="n">show</span><span class="p">()</span>
<span class="c1"># +----+-------+
# | age| name|
# +----+-------+
# |null|Michael|
# | 30| Andy|
# | 19| Justin|
# +----+-------+</span></code></pre></div>
<div><small>Find full example code at "examples/src/main/python/sql/basic.py" in the Spark repo.</small></div>
</div>
<div data-lang="r">
<p>The <code class="language-plaintext highlighter-rouge">sql</code> function enables applications to run SQL queries programmatically and returns the result as a <code class="language-plaintext highlighter-rouge">SparkDataFrame</code>.</p>
<div class="highlight"><pre class="codehilite"><code><span class="n">df</span><span class="w"> </span><span class="o">&lt;-</span><span class="w"> </span><span class="n">sql</span><span class="p">(</span><span class="s2">"SELECT * FROM table"</span><span class="p">)</span></code></pre></div>
<div><small>Find full example code at "examples/src/main/r/RSparkSQLExample.R" in the Spark repo.</small></div>
</div>
</div>
<h2 id="global-temporary-view">Global Temporary View</h2>
<p>Temporary views in Spark SQL are session-scoped and disappear when the session that created them
terminates. If you want a temporary view that is shared among all sessions and stays alive
until the Spark application terminates, you can create a global temporary view. Global temporary
views are tied to the system-preserved database <code class="language-plaintext highlighter-rouge">global_temp</code>, and you must use the qualified name to
refer to them, e.g. <code class="language-plaintext highlighter-rouge">SELECT * FROM global_temp.view1</code>.</p>
<div class="codetabs">
<div data-lang="scala">
<div class="highlight"><pre class="codehilite"><code><span class="c1">// Register the DataFrame as a global temporary view</span>
<span class="nv">df</span><span class="o">.</span><span class="py">createGlobalTempView</span><span class="o">(</span><span class="s">"people"</span><span class="o">)</span>
<span class="c1">// Global temporary view is tied to a system preserved database `global_temp`</span>
<span class="nv">spark</span><span class="o">.</span><span class="py">sql</span><span class="o">(</span><span class="s">"SELECT * FROM global_temp.people"</span><span class="o">).</span><span class="py">show</span><span class="o">()</span>
<span class="c1">// +----+-------+</span>
<span class="c1">// | age| name|</span>
<span class="c1">// +----+-------+</span>
<span class="c1">// |null|Michael|</span>
<span class="c1">// | 30| Andy|</span>
<span class="c1">// | 19| Justin|</span>
<span class="c1">// +----+-------+</span>
<span class="c1">// Global temporary view is cross-session</span>
<span class="nv">spark</span><span class="o">.</span><span class="py">newSession</span><span class="o">().</span><span class="py">sql</span><span class="o">(</span><span class="s">"SELECT * FROM global_temp.people"</span><span class="o">).</span><span class="py">show</span><span class="o">()</span>
<span class="c1">// +----+-------+</span>
<span class="c1">// | age| name|</span>
<span class="c1">// +----+-------+</span>
<span class="c1">// |null|Michael|</span>
<span class="c1">// | 30| Andy|</span>
<span class="c1">// | 19| Justin|</span>
<span class="c1">// +----+-------+</span></code></pre></div>
<div><small>Find full example code at "examples/src/main/scala/org/apache/spark/examples/sql/SparkSQLExample.scala" in the Spark repo.</small></div>
</div>
<div data-lang="java">
<div class="highlight"><pre class="codehilite"><code><span class="c1">// Register the DataFrame as a global temporary view</span>
<span class="n">df</span><span class="o">.</span><span class="na">createGlobalTempView</span><span class="o">(</span><span class="s">"people"</span><span class="o">);</span>
<span class="c1">// Global temporary view is tied to a system preserved database `global_temp`</span>
<span class="n">spark</span><span class="o">.</span><span class="na">sql</span><span class="o">(</span><span class="s">"SELECT * FROM global_temp.people"</span><span class="o">).</span><span class="na">show</span><span class="o">();</span>
<span class="c1">// +----+-------+</span>
<span class="c1">// | age| name|</span>
<span class="c1">// +----+-------+</span>
<span class="c1">// |null|Michael|</span>
<span class="c1">// | 30| Andy|</span>
<span class="c1">// | 19| Justin|</span>
<span class="c1">// +----+-------+</span>
<span class="c1">// Global temporary view is cross-session</span>
<span class="n">spark</span><span class="o">.</span><span class="na">newSession</span><span class="o">().</span><span class="na">sql</span><span class="o">(</span><span class="s">"SELECT * FROM global_temp.people"</span><span class="o">).</span><span class="na">show</span><span class="o">();</span>
<span class="c1">// +----+-------+</span>
<span class="c1">// | age| name|</span>
<span class="c1">// +----+-------+</span>
<span class="c1">// |null|Michael|</span>
<span class="c1">// | 30| Andy|</span>
<span class="c1">// | 19| Justin|</span>
<span class="c1">// +----+-------+</span></code></pre></div>
<div><small>Find full example code at "examples/src/main/java/org/apache/spark/examples/sql/JavaSparkSQLExample.java" in the Spark repo.</small></div>
</div>
<div data-lang="python">
<div class="highlight"><pre class="codehilite"><code><span class="c1"># Register the DataFrame as a global temporary view
</span><span class="n">df</span><span class="p">.</span><span class="n">createGlobalTempView</span><span class="p">(</span><span class="s">"people"</span><span class="p">)</span>
<span class="c1"># Global temporary view is tied to a system preserved database `global_temp`
</span><span class="n">spark</span><span class="p">.</span><span class="n">sql</span><span class="p">(</span><span class="s">"SELECT * FROM global_temp.people"</span><span class="p">).</span><span class="n">show</span><span class="p">()</span>
<span class="c1"># +----+-------+
# | age| name|
# +----+-------+
# |null|Michael|
# | 30| Andy|
# | 19| Justin|
# +----+-------+
</span>
<span class="c1"># Global temporary view is cross-session
</span><span class="n">spark</span><span class="p">.</span><span class="n">newSession</span><span class="p">().</span><span class="n">sql</span><span class="p">(</span><span class="s">"SELECT * FROM global_temp.people"</span><span class="p">).</span><span class="n">show</span><span class="p">()</span>
<span class="c1"># +----+-------+
# | age| name|
# +----+-------+
# |null|Michael|
# | 30| Andy|
# | 19| Justin|
# +----+-------+</span></code></pre></div>
<div><small>Find full example code at "examples/src/main/python/sql/basic.py" in the Spark repo.</small></div>
</div>
<div data-lang="SQL">
<figure class="highlight"><pre><code class="language-sql" data-lang="sql"><span class="k">CREATE</span> <span class="k">GLOBAL</span> <span class="k">TEMPORARY</span> <span class="k">VIEW</span> <span class="n">temp_view</span> <span class="k">AS</span> <span class="k">SELECT</span> <span class="n">a</span> <span class="o">+</span> <span class="mi">1</span><span class="p">,</span> <span class="n">b</span> <span class="o">*</span> <span class="mi">2</span> <span class="k">FROM</span> <span class="n">tbl</span>
<span class="k">SELECT</span> <span class="o">*</span> <span class="k">FROM</span> <span class="n">global_temp</span><span class="p">.</span><span class="n">temp_view</span></code></pre></figure>
</div>
</div>
<h2 id="creating-datasets">Creating Datasets</h2>
<p>Datasets are similar to RDDs; however, instead of using Java serialization or Kryo, they use
a specialized <a href="api/scala/org/apache/spark/sql/Encoder.html">Encoder</a> to serialize the objects
for processing or for transmission over the network. While both encoders and standard serialization are
responsible for turning an object into bytes, encoders are generated dynamically as code and use a format
that allows Spark to perform many operations, such as filtering, sorting, and hashing, without deserializing
the bytes back into an object.</p>
<div class="codetabs">
<div data-lang="scala">
<div class="highlight"><pre class="codehilite"><code><span class="k">case</span> <span class="k">class</span> <span class="nc">Person</span><span class="o">(</span><span class="n">name</span><span class="k">:</span> <span class="kt">String</span><span class="o">,</span> <span class="n">age</span><span class="k">:</span> <span class="kt">Long</span><span class="o">)</span>
<span class="c1">// Encoders are created for case classes</span>
<span class="k">val</span> <span class="nv">caseClassDS</span> <span class="k">=</span> <span class="nc">Seq</span><span class="o">(</span><span class="nc">Person</span><span class="o">(</span><span class="s">"Andy"</span><span class="o">,</span> <span class="mi">32</span><span class="o">)).</span><span class="py">toDS</span><span class="o">()</span>
<span class="nv">caseClassDS</span><span class="o">.</span><span class="py">show</span><span class="o">()</span>
<span class="c1">// +----+---+</span>
<span class="c1">// |name|age|</span>
<span class="c1">// +----+---+</span>
<span class="c1">// |Andy| 32|</span>
<span class="c1">// +----+---+</span>
<span class="c1">// Encoders for most common types are automatically provided by importing spark.implicits._</span>
<span class="k">val</span> <span class="nv">primitiveDS</span> <span class="k">=</span> <span class="nc">Seq</span><span class="o">(</span><span class="mi">1</span><span class="o">,</span> <span class="mi">2</span><span class="o">,</span> <span class="mi">3</span><span class="o">).</span><span class="py">toDS</span><span class="o">()</span>
<span class="nv">primitiveDS</span><span class="o">.</span><span class="py">map</span><span class="o">(</span><span class="k">_</span> <span class="o">+</span> <span class="mi">1</span><span class="o">).</span><span class="py">collect</span><span class="o">()</span> <span class="c1">// Returns: Array(2, 3, 4)</span>
<span class="c1">// DataFrames can be converted to a Dataset by providing a class. Mapping will be done by name</span>
<span class="k">val</span> <span class="nv">path</span> <span class="k">=</span> <span class="s">"examples/src/main/resources/people.json"</span>
<span class="k">val</span> <span class="nv">peopleDS</span> <span class="k">=</span> <span class="nv">spark</span><span class="o">.</span><span class="py">read</span><span class="o">.</span><span class="py">json</span><span class="o">(</span><span class="n">path</span><span class="o">).</span><span class="py">as</span><span class="o">[</span><span class="kt">Person</span><span class="o">]</span>
<span class="nv">peopleDS</span><span class="o">.</span><span class="py">show</span><span class="o">()</span>
<span class="c1">// +----+-------+</span>
<span class="c1">// | age| name|</span>
<span class="c1">// +----+-------+</span>
<span class="c1">// |null|Michael|</span>
<span class="c1">// | 30| Andy|</span>
<span class="c1">// | 19| Justin|</span>
<span class="c1">// +----+-------+</span></code></pre></div>
<div><small>Find full example code at "examples/src/main/scala/org/apache/spark/examples/sql/SparkSQLExample.scala" in the Spark repo.</small></div>
</div>
<div data-lang="java">
<div class="highlight"><pre class="codehilite"><code><span class="kn">import</span> <span class="nn">java.util.Arrays</span><span class="o">;</span>
<span class="kn">import</span> <span class="nn">java.util.Collections</span><span class="o">;</span>
<span class="kn">import</span> <span class="nn">java.io.Serializable</span><span class="o">;</span>
<span class="kn">import</span> <span class="nn">org.apache.spark.api.java.function.MapFunction</span><span class="o">;</span>
<span class="kn">import</span> <span class="nn">org.apache.spark.sql.Dataset</span><span class="o">;</span>
<span class="kn">import</span> <span class="nn">org.apache.spark.sql.Row</span><span class="o">;</span>
<span class="kn">import</span> <span class="nn">org.apache.spark.sql.Encoder</span><span class="o">;</span>
<span class="kn">import</span> <span class="nn">org.apache.spark.sql.Encoders</span><span class="o">;</span>
<span class="kd">public</span> <span class="kd">static</span> <span class="kd">class</span> <span class="nc">Person</span> <span class="kd">implements</span> <span class="nc">Serializable</span> <span class="o">{</span>
<span class="kd">private</span> <span class="nc">String</span> <span class="n">name</span><span class="o">;</span>
<span class="kd">private</span> <span class="kt">long</span> <span class="n">age</span><span class="o">;</span>
<span class="kd">public</span> <span class="nc">String</span> <span class="nf">getName</span><span class="o">()</span> <span class="o">{</span>
<span class="k">return</span> <span class="n">name</span><span class="o">;</span>
<span class="o">}</span>
<span class="kd">public</span> <span class="kt">void</span> <span class="nf">setName</span><span class="o">(</span><span class="nc">String</span> <span class="n">name</span><span class="o">)</span> <span class="o">{</span>
<span class="k">this</span><span class="o">.</span><span class="na">name</span> <span class="o">=</span> <span class="n">name</span><span class="o">;</span>
<span class="o">}</span>
<span class="kd">public</span> <span class="kt">long</span> <span class="nf">getAge</span><span class="o">()</span> <span class="o">{</span>
<span class="k">return</span> <span class="n">age</span><span class="o">;</span>
<span class="o">}</span>
<span class="kd">public</span> <span class="kt">void</span> <span class="nf">setAge</span><span class="o">(</span><span class="kt">long</span> <span class="n">age</span><span class="o">)</span> <span class="o">{</span>
<span class="k">this</span><span class="o">.</span><span class="na">age</span> <span class="o">=</span> <span class="n">age</span><span class="o">;</span>
<span class="o">}</span>
<span class="o">}</span>
<span class="c1">// Create an instance of a Bean class</span>
<span class="nc">Person</span> <span class="n">person</span> <span class="o">=</span> <span class="k">new</span> <span class="nc">Person</span><span class="o">();</span>
<span class="n">person</span><span class="o">.</span><span class="na">setName</span><span class="o">(</span><span class="s">"Andy"</span><span class="o">);</span>
<span class="n">person</span><span class="o">.</span><span class="na">setAge</span><span class="o">(</span><span class="mi">32</span><span class="o">);</span>
<span class="c1">// Encoders are created for Java beans</span>
<span class="nc">Encoder</span><span class="o">&lt;</span><span class="nc">Person</span><span class="o">&gt;</span> <span class="n">personEncoder</span> <span class="o">=</span> <span class="nc">Encoders</span><span class="o">.</span><span class="na">bean</span><span class="o">(</span><span class="nc">Person</span><span class="o">.</span><span class="na">class</span><span class="o">);</span>
<span class="nc">Dataset</span><span class="o">&lt;</span><span class="nc">Person</span><span class="o">&gt;</span> <span class="n">javaBeanDS</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="na">createDataset</span><span class="o">(</span>
<span class="nc">Collections</span><span class="o">.</span><span class="na">singletonList</span><span class="o">(</span><span class="n">person</span><span class="o">),</span>
<span class="n">personEncoder</span>
<span class="o">);</span>
<span class="n">javaBeanDS</span><span class="o">.</span><span class="na">show</span><span class="o">();</span>
<span class="c1">// +---+----+</span>
<span class="c1">// |age|name|</span>
<span class="c1">// +---+----+</span>
<span class="c1">// | 32|Andy|</span>
<span class="c1">// +---+----+</span>
<span class="c1">// Encoders for most common types are provided in class Encoders</span>
<span class="nc">Encoder</span><span class="o">&lt;</span><span class="nc">Long</span><span class="o">&gt;</span> <span class="n">longEncoder</span> <span class="o">=</span> <span class="nc">Encoders</span><span class="o">.</span><span class="na">LONG</span><span class="o">();</span>
<span class="nc">Dataset</span><span class="o">&lt;</span><span class="nc">Long</span><span class="o">&gt;</span> <span class="n">primitiveDS</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="na">createDataset</span><span class="o">(</span><span class="nc">Arrays</span><span class="o">.</span><span class="na">asList</span><span class="o">(</span><span class="mi">1L</span><span class="o">,</span> <span class="mi">2L</span><span class="o">,</span> <span class="mi">3L</span><span class="o">),</span> <span class="n">longEncoder</span><span class="o">);</span>
<span class="nc">Dataset</span><span class="o">&lt;</span><span class="nc">Long</span><span class="o">&gt;</span> <span class="n">transformedDS</span> <span class="o">=</span> <span class="n">primitiveDS</span><span class="o">.</span><span class="na">map</span><span class="o">(</span>
<span class="o">(</span><span class="nc">MapFunction</span><span class="o">&lt;</span><span class="nc">Long</span><span class="o">,</span> <span class="nc">Long</span><span class="o">&gt;)</span> <span class="n">value</span> <span class="o">-&gt;</span> <span class="n">value</span> <span class="o">+</span> <span class="mi">1L</span><span class="o">,</span>
<span class="n">longEncoder</span><span class="o">);</span>
<span class="n">transformedDS</span><span class="o">.</span><span class="na">collect</span><span class="o">();</span> <span class="c1">// Returns [2, 3, 4]</span>
<span class="c1">// DataFrames can be converted to a Dataset by providing a class. Mapping based on name</span>
<span class="nc">String</span> <span class="n">path</span> <span class="o">=</span> <span class="s">"examples/src/main/resources/people.json"</span><span class="o">;</span>
<span class="nc">Dataset</span><span class="o">&lt;</span><span class="nc">Person</span><span class="o">&gt;</span> <span class="n">peopleDS</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="na">read</span><span class="o">().</span><span class="na">json</span><span class="o">(</span><span class="n">path</span><span class="o">).</span><span class="na">as</span><span class="o">(</span><span class="n">personEncoder</span><span class="o">);</span>
<span class="n">peopleDS</span><span class="o">.</span><span class="na">show</span><span class="o">();</span>
<span class="c1">// +----+-------+</span>
<span class="c1">// | age| name|</span>
<span class="c1">// +----+-------+</span>
<span class="c1">// |null|Michael|</span>
<span class="c1">// | 30| Andy|</span>
<span class="c1">// | 19| Justin|</span>
<span class="c1">// +----+-------+</span></code></pre></div>
<div><small>Find full example code at "examples/src/main/java/org/apache/spark/examples/sql/JavaSparkSQLExample.java" in the Spark repo.</small></div>
</div>
</div>
<h2 id="interoperating-with-rdds">Interoperating with RDDs</h2>
<p>Spark SQL supports two different methods for converting existing RDDs into Datasets. The first
method uses reflection to infer the schema of an RDD that contains specific types of objects. This
reflection-based approach leads to more concise code and works well when you already know the schema
while writing your Spark application.</p>
<p>The second method for creating Datasets is through a programmatic interface that allows you to
construct a schema and then apply it to an existing RDD. While this method is more verbose, it allows
you to construct Datasets when the columns and their types are not known until runtime.</p>
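<p>As a quick orientation before the detailed subsections below, here is a minimal sketch contrasting the two approaches in Scala. It is not taken from the example files; the names <code class="language-plaintext highlighter-rouge">raw</code>, <code class="language-plaintext highlighter-rouge">byReflection</code> and <code class="language-plaintext highlighter-rouge">byStructType</code> are illustrative, and an existing <code class="language-plaintext highlighter-rouge">spark</code> session is assumed as in the other examples:</p>
<div class="highlight"><pre class="codehilite"><code>import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{LongType, StringType, StructField, StructType}
import spark.implicits._

val raw = spark.sparkContext.parallelize(Seq(("Andy", 32L), ("Justin", 19L)))

// 1) Reflection: a case class fixes the schema at compile time
case class Person(name: String, age: Long)
val byReflection = raw.map { case (n, a) =&gt; Person(n, a) }.toDF()

// 2) Programmatic: build a StructType at runtime and apply it to an RDD of Rows
val schema = StructType(Seq(
  StructField("name", StringType, nullable = true),
  StructField("age", LongType, nullable = true)))
val byStructType = spark.createDataFrame(raw.map { case (n, a) =&gt; Row(n, a) }, schema)</code></pre></div>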
<h3 id="inferring-the-schema-using-reflection">Inferring the Schema Using Reflection</h3>
<div class="codetabs">
<div data-lang="scala">
<p>The Scala interface for Spark SQL supports automatically converting an RDD containing case classes
to a DataFrame. The case class
defines the schema of the table. The names of the arguments to the case class are read using
reflection and become the names of the columns. Case classes can also be nested or contain complex
types such as <code class="language-plaintext highlighter-rouge">Seq</code>s or <code class="language-plaintext highlighter-rouge">Array</code>s. Such an RDD can be implicitly converted to a DataFrame and then
registered as a table, and that table can be used in subsequent SQL statements.</p>
<div class="highlight"><pre class="codehilite"><code><span class="c1">// For implicit conversions from RDDs to DataFrames</span>
<span class="k">import</span> <span class="nn">spark.implicits._</span>
<span class="c1">// Create an RDD of Person objects from a text file, convert it to a Dataframe</span>
<span class="k">val</span> <span class="nv">peopleDF</span> <span class="k">=</span> <span class="nv">spark</span><span class="o">.</span><span class="py">sparkContext</span>
<span class="o">.</span><span class="py">textFile</span><span class="o">(</span><span class="s">"examples/src/main/resources/people.txt"</span><span class="o">)</span>
<span class="o">.</span><span class="py">map</span><span class="o">(</span><span class="nv">_</span><span class="o">.</span><span class="py">split</span><span class="o">(</span><span class="s">","</span><span class="o">))</span>
<span class="o">.</span><span class="py">map</span><span class="o">(</span><span class="n">attributes</span> <span class="k">=&gt;</span> <span class="nc">Person</span><span class="o">(</span><span class="nf">attributes</span><span class="o">(</span><span class="mi">0</span><span class="o">),</span> <span class="nf">attributes</span><span class="o">(</span><span class="mi">1</span><span class="o">).</span><span class="py">trim</span><span class="o">.</span><span class="py">toInt</span><span class="o">))</span>
<span class="o">.</span><span class="py">toDF</span><span class="o">()</span>
<span class="c1">// Register the DataFrame as a temporary view</span>
<span class="nv">peopleDF</span><span class="o">.</span><span class="py">createOrReplaceTempView</span><span class="o">(</span><span class="s">"people"</span><span class="o">)</span>
<span class="c1">// SQL statements can be run by using the sql methods provided by Spark</span>
<span class="k">val</span> <span class="nv">teenagersDF</span> <span class="k">=</span> <span class="nv">spark</span><span class="o">.</span><span class="py">sql</span><span class="o">(</span><span class="s">"SELECT name, age FROM people WHERE age BETWEEN 13 AND 19"</span><span class="o">)</span>
<span class="c1">// The columns of a row in the result can be accessed by field index</span>
<span class="nv">teenagersDF</span><span class="o">.</span><span class="py">map</span><span class="o">(</span><span class="n">teenager</span> <span class="k">=&gt;</span> <span class="s">"Name: "</span> <span class="o">+</span> <span class="nf">teenager</span><span class="o">(</span><span class="mi">0</span><span class="o">)).</span><span class="py">show</span><span class="o">()</span>
<span class="c1">// +------------+</span>
<span class="c1">// | value|</span>
<span class="c1">// +------------+</span>
<span class="c1">// |Name: Justin|</span>
<span class="c1">// +------------+</span>
<span class="c1">// or by field name</span>
<span class="nv">teenagersDF</span><span class="o">.</span><span class="py">map</span><span class="o">(</span><span class="n">teenager</span> <span class="k">=&gt;</span> <span class="s">"Name: "</span> <span class="o">+</span> <span class="nv">teenager</span><span class="o">.</span><span class="py">getAs</span><span class="o">[</span><span class="kt">String</span><span class="o">](</span><span class="s">"name"</span><span class="o">)).</span><span class="py">show</span><span class="o">()</span>
<span class="c1">// +------------+</span>
<span class="c1">// | value|</span>
<span class="c1">// +------------+</span>
<span class="c1">// |Name: Justin|</span>
<span class="c1">// +------------+</span>
<span class="c1">// No pre-defined encoders for Dataset[Map[K,V]], define explicitly</span>
<span class="k">implicit</span> <span class="k">val</span> <span class="nv">mapEncoder</span> <span class="k">=</span> <span class="nv">org</span><span class="o">.</span><span class="py">apache</span><span class="o">.</span><span class="py">spark</span><span class="o">.</span><span class="py">sql</span><span class="o">.</span><span class="py">Encoders</span><span class="o">.</span><span class="py">kryo</span><span class="o">[</span><span class="kt">Map</span><span class="o">[</span><span class="kt">String</span>, <span class="kt">Any</span><span class="o">]]</span>
<span class="c1">// Primitive types and case classes can be also defined as</span>
<span class="c1">// implicit val stringIntMapEncoder: Encoder[Map[String, Any]] = ExpressionEncoder()</span>
<span class="c1">// row.getValuesMap[T] retrieves multiple columns at once into a Map[String, T]</span>
<span class="nv">teenagersDF</span><span class="o">.</span><span class="py">map</span><span class="o">(</span><span class="n">teenager</span> <span class="k">=&gt;</span> <span class="nv">teenager</span><span class="o">.</span><span class="py">getValuesMap</span><span class="o">[</span><span class="kt">Any</span><span class="o">](</span><span class="nc">List</span><span class="o">(</span><span class="s">"name"</span><span class="o">,</span> <span class="s">"age"</span><span class="o">))).</span><span class="py">collect</span><span class="o">()</span>
<span class="c1">// Array(Map("name" -&gt; "Justin", "age" -&gt; 19))</span></code></pre></div>
<div><small>Find full example code at "examples/src/main/scala/org/apache/spark/examples/sql/SparkSQLExample.scala" in the Spark repo.</small></div>
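<p>As noted above, case classes can be nested or contain complex types. The short sketch below is not part of the original example (<code class="language-plaintext highlighter-rouge">Address</code> and <code class="language-plaintext highlighter-rouge">Employee</code> are illustrative names); it shows how nesting is reflected directly in the inferred schema, assuming <code class="language-plaintext highlighter-rouge">import spark.implicits._</code> is in scope as in the example above:</p>
<div class="highlight"><pre class="codehilite"><code>case class Address(city: String, zip: String)
case class Employee(name: String, address: Address, skills: Seq[String])

// Nested case classes become struct columns; Seqs become array columns
val employees = Seq(Employee("Ann", Address("Oslo", "0150"), Seq("scala", "sql"))).toDS()
employees.printSchema()
// root
//  |-- name: string (nullable = true)
//  |-- address: struct (nullable = true)
//  |    |-- city: string (nullable = true)
//  |    |-- zip: string (nullable = true)
//  |-- skills: array (nullable = true)
//  |    |-- element: string (containsNull = true)</code></pre></div>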
</div>
<div data-lang="java">
<p>Spark SQL supports automatically converting an RDD of
<a href="http://stackoverflow.com/questions/3295496/what-is-a-javabean-exactly">JavaBeans</a> into a DataFrame.
The <code class="language-plaintext highlighter-rouge">BeanInfo</code>, obtained using reflection, defines the schema of the table. Currently, Spark SQL
does not support JavaBeans that contain <code class="language-plaintext highlighter-rouge">Map</code> field(s). Nested JavaBeans and <code class="language-plaintext highlighter-rouge">List</code> or <code class="language-plaintext highlighter-rouge">Array</code>
fields are supported though. You can create a JavaBean by creating a class that implements
Serializable and has getters and setters for all of its fields.</p>
<div class="highlight"><pre class="codehilite"><code><span class="kn">import</span> <span class="nn">org.apache.spark.api.java.JavaRDD</span><span class="o">;</span>
<span class="kn">import</span> <span class="nn">org.apache.spark.api.java.function.Function</span><span class="o">;</span>
<span class="kn">import</span> <span class="nn">org.apache.spark.api.java.function.MapFunction</span><span class="o">;</span>
<span class="kn">import</span> <span class="nn">org.apache.spark.sql.Dataset</span><span class="o">;</span>
<span class="kn">import</span> <span class="nn">org.apache.spark.sql.Row</span><span class="o">;</span>
<span class="kn">import</span> <span class="nn">org.apache.spark.sql.Encoder</span><span class="o">;</span>
<span class="kn">import</span> <span class="nn">org.apache.spark.sql.Encoders</span><span class="o">;</span>
<span class="c1">// Create an RDD of Person objects from a text file</span>
<span class="nc">JavaRDD</span><span class="o">&lt;</span><span class="nc">Person</span><span class="o">&gt;</span> <span class="n">peopleRDD</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="na">read</span><span class="o">()</span>
<span class="o">.</span><span class="na">textFile</span><span class="o">(</span><span class="s">"examples/src/main/resources/people.txt"</span><span class="o">)</span>
<span class="o">.</span><span class="na">javaRDD</span><span class="o">()</span>
<span class="o">.</span><span class="na">map</span><span class="o">(</span><span class="n">line</span> <span class="o">-&gt;</span> <span class="o">{</span>
<span class="nc">String</span><span class="o">[]</span> <span class="n">parts</span> <span class="o">=</span> <span class="n">line</span><span class="o">.</span><span class="na">split</span><span class="o">(</span><span class="s">","</span><span class="o">);</span>
<span class="nc">Person</span> <span class="n">person</span> <span class="o">=</span> <span class="k">new</span> <span class="nc">Person</span><span class="o">();</span>
<span class="n">person</span><span class="o">.</span><span class="na">setName</span><span class="o">(</span><span class="n">parts</span><span class="o">[</span><span class="mi">0</span><span class="o">]);</span>
<span class="n">person</span><span class="o">.</span><span class="na">setAge</span><span class="o">(</span><span class="nc">Integer</span><span class="o">.</span><span class="na">parseInt</span><span class="o">(</span><span class="n">parts</span><span class="o">[</span><span class="mi">1</span><span class="o">].</span><span class="na">trim</span><span class="o">()));</span>
<span class="k">return</span> <span class="n">person</span><span class="o">;</span>
<span class="o">});</span>
<span class="c1">// Apply a schema to an RDD of JavaBeans to get a DataFrame</span>
<span class="nc">Dataset</span><span class="o">&lt;</span><span class="nc">Row</span><span class="o">&gt;</span> <span class="n">peopleDF</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="na">createDataFrame</span><span class="o">(</span><span class="n">peopleRDD</span><span class="o">,</span> <span class="nc">Person</span><span class="o">.</span><span class="na">class</span><span class="o">);</span>
<span class="c1">// Register the DataFrame as a temporary view</span>
<span class="n">peopleDF</span><span class="o">.</span><span class="na">createOrReplaceTempView</span><span class="o">(</span><span class="s">"people"</span><span class="o">);</span>
<span class="c1">// SQL statements can be run by using the sql methods provided by spark</span>
<span class="nc">Dataset</span><span class="o">&lt;</span><span class="nc">Row</span><span class="o">&gt;</span> <span class="n">teenagersDF</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="na">sql</span><span class="o">(</span><span class="s">"SELECT name FROM people WHERE age BETWEEN 13 AND 19"</span><span class="o">);</span>
<span class="c1">// The columns of a row in the result can be accessed by field index</span>
<span class="nc">Encoder</span><span class="o">&lt;</span><span class="nc">String</span><span class="o">&gt;</span> <span class="n">stringEncoder</span> <span class="o">=</span> <span class="nc">Encoders</span><span class="o">.</span><span class="na">STRING</span><span class="o">();</span>
<span class="nc">Dataset</span><span class="o">&lt;</span><span class="nc">String</span><span class="o">&gt;</span> <span class="n">teenagerNamesByIndexDF</span> <span class="o">=</span> <span class="n">teenagersDF</span><span class="o">.</span><span class="na">map</span><span class="o">(</span>
<span class="o">(</span><span class="nc">MapFunction</span><span class="o">&lt;</span><span class="nc">Row</span><span class="o">,</span> <span class="nc">String</span><span class="o">&gt;)</span> <span class="n">row</span> <span class="o">-&gt;</span> <span class="s">"Name: "</span> <span class="o">+</span> <span class="n">row</span><span class="o">.</span><span class="na">getString</span><span class="o">(</span><span class="mi">0</span><span class="o">),</span>
<span class="n">stringEncoder</span><span class="o">);</span>
<span class="n">teenagerNamesByIndexDF</span><span class="o">.</span><span class="na">show</span><span class="o">();</span>
<span class="c1">// +------------+</span>
<span class="c1">// | value|</span>
<span class="c1">// +------------+</span>
<span class="c1">// |Name: Justin|</span>
<span class="c1">// +------------+</span>
<span class="c1">// or by field name</span>
<span class="nc">Dataset</span><span class="o">&lt;</span><span class="nc">String</span><span class="o">&gt;</span> <span class="n">teenagerNamesByFieldDF</span> <span class="o">=</span> <span class="n">teenagersDF</span><span class="o">.</span><span class="na">map</span><span class="o">(</span>
<span class="o">(</span><span class="nc">MapFunction</span><span class="o">&lt;</span><span class="nc">Row</span><span class="o">,</span> <span class="nc">String</span><span class="o">&gt;)</span> <span class="n">row</span> <span class="o">-&gt;</span> <span class="s">"Name: "</span> <span class="o">+</span> <span class="n">row</span><span class="o">.&lt;</span><span class="nc">String</span><span class="o">&gt;</span><span class="n">getAs</span><span class="o">(</span><span class="s">"name"</span><span class="o">),</span>
<span class="n">stringEncoder</span><span class="o">);</span>
<span class="n">teenagerNamesByFieldDF</span><span class="o">.</span><span class="na">show</span><span class="o">();</span>
<span class="c1">// +------------+</span>
<span class="c1">// | value|</span>
<span class="c1">// +------------+</span>
<span class="c1">// |Name: Justin|</span>
<span class="c1">// +------------+</span></code></pre></div>
<div><small>Find full example code at "examples/src/main/java/org/apache/spark/examples/sql/JavaSparkSQLExample.java" in the Spark repo.</small></div>
</div>
<div data-lang="python">
<p>Spark SQL can convert an RDD of Row objects to a DataFrame, inferring the datatypes. Rows are constructed by passing
key/value pairs as kwargs to the Row class. The keys define the column names of the table,
and the types are inferred by sampling the whole dataset, similar to the inference that is performed on JSON files.</p>
<div class="highlight"><pre class="codehilite"><code><span class="kn">from</span> <span class="nn">pyspark.sql</span> <span class="kn">import</span> <span class="n">Row</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">spark</span><span class="p">.</span><span class="n">sparkContext</span>
<span class="c1"># Load a text file and convert each line to a Row.
</span><span class="n">lines</span> <span class="o">=</span> <span class="n">sc</span><span class="p">.</span><span class="n">textFile</span><span class="p">(</span><span class="s">"examples/src/main/resources/people.txt"</span><span class="p">)</span>
<span class="n">parts</span> <span class="o">=</span> <span class="n">lines</span><span class="p">.</span><span class="nb">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">l</span><span class="p">:</span> <span class="n">l</span><span class="p">.</span><span class="n">split</span><span class="p">(</span><span class="s">","</span><span class="p">))</span>
<span class="n">people</span> <span class="o">=</span> <span class="n">parts</span><span class="p">.</span><span class="nb">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">p</span><span class="p">:</span> <span class="n">Row</span><span class="p">(</span><span class="n">name</span><span class="o">=</span><span class="n">p</span><span class="p">[</span><span class="mi">0</span><span class="p">],</span> <span class="n">age</span><span class="o">=</span><span class="nb">int</span><span class="p">(</span><span class="n">p</span><span class="p">[</span><span class="mi">1</span><span class="p">])))</span>
<span class="c1"># Infer the schema, and register the DataFrame as a table.
</span><span class="n">schemaPeople</span> <span class="o">=</span> <span class="n">spark</span><span class="p">.</span><span class="n">createDataFrame</span><span class="p">(</span><span class="n">people</span><span class="p">)</span>
<span class="n">schemaPeople</span><span class="p">.</span><span class="n">createOrReplaceTempView</span><span class="p">(</span><span class="s">"people"</span><span class="p">)</span>
<span class="c1"># SQL can be run over DataFrames that have been registered as a table.
</span><span class="n">teenagers</span> <span class="o">=</span> <span class="n">spark</span><span class="p">.</span><span class="n">sql</span><span class="p">(</span><span class="s">"SELECT name FROM people WHERE age &gt;= 13 AND age &lt;= 19"</span><span class="p">)</span>
<span class="c1"># The results of SQL queries are Dataframe objects.
# rdd returns the content as an :class:`pyspark.RDD` of :class:`Row`.
</span><span class="n">teenNames</span> <span class="o">=</span> <span class="n">teenagers</span><span class="p">.</span><span class="n">rdd</span><span class="p">.</span><span class="nb">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">p</span><span class="p">:</span> <span class="s">"Name: "</span> <span class="o">+</span> <span class="n">p</span><span class="p">.</span><span class="n">name</span><span class="p">).</span><span class="n">collect</span><span class="p">()</span>
<span class="k">for</span> <span class="n">name</span> <span class="ow">in</span> <span class="n">teenNames</span><span class="p">:</span>
<span class="k">print</span><span class="p">(</span><span class="n">name</span><span class="p">)</span>
<span class="c1"># Name: Justin</span></code></pre></div>
<div><small>Find full example code at "examples/src/main/python/sql/basic.py" in the Spark repo.</small></div>
</div>
</div>
<h3 id="programmatically-specifying-the-schema">Programmatically Specifying the Schema</h3>
<div class="codetabs">
<div data-lang="scala">
<p>When case classes cannot be defined ahead of time (for example,
the structure of records is encoded in a string, or a text dataset will be parsed
and fields will be projected differently for different users),
a <code class="language-plaintext highlighter-rouge">DataFrame</code> can be created programmatically with three steps.</p>
<ol>
<li>Create an RDD of <code class="language-plaintext highlighter-rouge">Row</code>s from the original RDD;</li>
<li>Create the schema represented by a <code class="language-plaintext highlighter-rouge">StructType</code> matching the structure of
<code class="language-plaintext highlighter-rouge">Row</code>s in the RDD created in step 1;</li>
<li>Apply the schema to the RDD of <code class="language-plaintext highlighter-rouge">Row</code>s via the <code class="language-plaintext highlighter-rouge">createDataFrame</code> method provided
by <code class="language-plaintext highlighter-rouge">SparkSession</code>.</li>
</ol>
<p>For example:</p>
<div class="highlight"><pre class="codehilite"><code><span class="k">import</span> <span class="nn">org.apache.spark.sql.Row</span>
<span class="k">import</span> <span class="nn">org.apache.spark.sql.types._</span>
<span class="c1">// Create an RDD</span>
<span class="k">val</span> <span class="nv">peopleRDD</span> <span class="k">=</span> <span class="nv">spark</span><span class="o">.</span><span class="py">sparkContext</span><span class="o">.</span><span class="py">textFile</span><span class="o">(</span><span class="s">"examples/src/main/resources/people.txt"</span><span class="o">)</span>
<span class="c1">// The schema is encoded in a string</span>
<span class="k">val</span> <span class="nv">schemaString</span> <span class="k">=</span> <span class="s">"name age"</span>
<span class="c1">// Generate the schema based on the string of schema</span>
<span class="k">val</span> <span class="nv">fields</span> <span class="k">=</span> <span class="nv">schemaString</span><span class="o">.</span><span class="py">split</span><span class="o">(</span><span class="s">" "</span><span class="o">)</span>
<span class="o">.</span><span class="py">map</span><span class="o">(</span><span class="n">fieldName</span> <span class="k">=&gt;</span> <span class="nc">StructField</span><span class="o">(</span><span class="n">fieldName</span><span class="o">,</span> <span class="nc">StringType</span><span class="o">,</span> <span class="n">nullable</span> <span class="k">=</span> <span class="kc">true</span><span class="o">))</span>
<span class="k">val</span> <span class="nv">schema</span> <span class="k">=</span> <span class="nc">StructType</span><span class="o">(</span><span class="n">fields</span><span class="o">)</span>
<span class="c1">// Convert records of the RDD (people) to Rows</span>
<span class="k">val</span> <span class="nv">rowRDD</span> <span class="k">=</span> <span class="n">peopleRDD</span>
<span class="o">.</span><span class="py">map</span><span class="o">(</span><span class="nv">_</span><span class="o">.</span><span class="py">split</span><span class="o">(</span><span class="s">","</span><span class="o">))</span>
<span class="o">.</span><span class="py">map</span><span class="o">(</span><span class="n">attributes</span> <span class="k">=&gt;</span> <span class="nc">Row</span><span class="o">(</span><span class="nf">attributes</span><span class="o">(</span><span class="mi">0</span><span class="o">),</span> <span class="nf">attributes</span><span class="o">(</span><span class="mi">1</span><span class="o">).</span><span class="py">trim</span><span class="o">))</span>
<span class="c1">// Apply the schema to the RDD</span>
<span class="k">val</span> <span class="nv">peopleDF</span> <span class="k">=</span> <span class="nv">spark</span><span class="o">.</span><span class="py">createDataFrame</span><span class="o">(</span><span class="n">rowRDD</span><span class="o">,</span> <span class="n">schema</span><span class="o">)</span>
<span class="c1">// Creates a temporary view using the DataFrame</span>
<span class="nv">peopleDF</span><span class="o">.</span><span class="py">createOrReplaceTempView</span><span class="o">(</span><span class="s">"people"</span><span class="o">)</span>
<span class="c1">// SQL can be run over a temporary view created using DataFrames</span>
<span class="k">val</span> <span class="nv">results</span> <span class="k">=</span> <span class="nv">spark</span><span class="o">.</span><span class="py">sql</span><span class="o">(</span><span class="s">"SELECT name FROM people"</span><span class="o">)</span>
<span class="c1">// The results of SQL queries are DataFrames and support all the normal RDD operations</span>
<span class="c1">// The columns of a row in the result can be accessed by field index or by field name</span>
<span class="nv">results</span><span class="o">.</span><span class="py">map</span><span class="o">(</span><span class="n">attributes</span> <span class="k">=&gt;</span> <span class="s">"Name: "</span> <span class="o">+</span> <span class="nf">attributes</span><span class="o">(</span><span class="mi">0</span><span class="o">)).</span><span class="py">show</span><span class="o">()</span>
<span class="c1">// +-------------+</span>
<span class="c1">// | value|</span>
<span class="c1">// +-------------+</span>
<span class="c1">// |Name: Michael|</span>
<span class="c1">// | Name: Andy|</span>
<span class="c1">// | Name: Justin|</span>
<span class="c1">// +-------------+</span></code></pre></div>
<div><small>Find full example code at "examples/src/main/scala/org/apache/spark/examples/sql/SparkSQLExample.scala" in the Spark repo.</small></div>
</div>
<div data-lang="java">
<p>When JavaBean classes cannot be defined ahead of time (for example,
the structure of records is encoded in a string, or a text dataset will be parsed and
fields will be projected differently for different users),
a <code class="language-plaintext highlighter-rouge">Dataset&lt;Row&gt;</code> can be created programmatically with three steps.</p>
<ol>
<li>Create an RDD of <code class="language-plaintext highlighter-rouge">Row</code>s from the original RDD;</li>
<li>Create the schema represented by a <code class="language-plaintext highlighter-rouge">StructType</code> matching the structure of
<code class="language-plaintext highlighter-rouge">Row</code>s in the RDD created in step 1;</li>
<li>Apply the schema to the RDD of <code class="language-plaintext highlighter-rouge">Row</code>s via the <code class="language-plaintext highlighter-rouge">createDataFrame</code> method provided
by <code class="language-plaintext highlighter-rouge">SparkSession</code>.</li>
</ol>
<p>For example:</p>
<div class="highlight"><pre class="codehilite"><code><span class="kn">import</span> <span class="nn">java.util.ArrayList</span><span class="o">;</span>
<span class="kn">import</span> <span class="nn">java.util.List</span><span class="o">;</span>
<span class="kn">import</span> <span class="nn">org.apache.spark.api.java.JavaRDD</span><span class="o">;</span>
<span class="kn">import</span> <span class="nn">org.apache.spark.api.java.function.Function</span><span class="o">;</span>
<span class="kn">import</span> <span class="nn">org.apache.spark.sql.Dataset</span><span class="o">;</span>
<span class="kn">import</span> <span class="nn">org.apache.spark.sql.Row</span><span class="o">;</span>
<span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.DataTypes</span><span class="o">;</span>
<span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.StructField</span><span class="o">;</span>
<span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.StructType</span><span class="o">;</span>
<span class="c1">// Create an RDD</span>
<span class="nc">JavaRDD</span><span class="o">&lt;</span><span class="nc">String</span><span class="o">&gt;</span> <span class="n">peopleRDD</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="na">sparkContext</span><span class="o">()</span>
<span class="o">.</span><span class="na">textFile</span><span class="o">(</span><span class="s">"examples/src/main/resources/people.txt"</span><span class="o">,</span> <span class="mi">1</span><span class="o">)</span>
<span class="o">.</span><span class="na">toJavaRDD</span><span class="o">();</span>
<span class="c1">// The schema is encoded in a string</span>
<span class="nc">String</span> <span class="n">schemaString</span> <span class="o">=</span> <span class="s">"name age"</span><span class="o">;</span>
<span class="c1">// Generate the schema based on the string of schema</span>
<span class="nc">List</span><span class="o">&lt;</span><span class="nc">StructField</span><span class="o">&gt;</span> <span class="n">fields</span> <span class="o">=</span> <span class="k">new</span> <span class="nc">ArrayList</span><span class="o">&lt;&gt;();</span>
<span class="k">for</span> <span class="o">(</span><span class="nc">String</span> <span class="n">fieldName</span> <span class="o">:</span> <span class="n">schemaString</span><span class="o">.</span><span class="na">split</span><span class="o">(</span><span class="s">" "</span><span class="o">))</span> <span class="o">{</span>
<span class="nc">StructField</span> <span class="n">field</span> <span class="o">=</span> <span class="nc">DataTypes</span><span class="o">.</span><span class="na">createStructField</span><span class="o">(</span><span class="n">fieldName</span><span class="o">,</span> <span class="nc">DataTypes</span><span class="o">.</span><span class="na">StringType</span><span class="o">,</span> <span class="kc">true</span><span class="o">);</span>
<span class="n">fields</span><span class="o">.</span><span class="na">add</span><span class="o">(</span><span class="n">field</span><span class="o">);</span>
<span class="o">}</span>
<span class="nc">StructType</span> <span class="n">schema</span> <span class="o">=</span> <span class="nc">DataTypes</span><span class="o">.</span><span class="na">createStructType</span><span class="o">(</span><span class="n">fields</span><span class="o">);</span>
<span class="c1">// Convert records of the RDD (people) to Rows</span>
<span class="nc">JavaRDD</span><span class="o">&lt;</span><span class="nc">Row</span><span class="o">&gt;</span> <span class="n">rowRDD</span> <span class="o">=</span> <span class="n">peopleRDD</span><span class="o">.</span><span class="na">map</span><span class="o">((</span><span class="nc">Function</span><span class="o">&lt;</span><span class="nc">String</span><span class="o">,</span> <span class="nc">Row</span><span class="o">&gt;)</span> <span class="n">record</span> <span class="o">-&gt;</span> <span class="o">{</span>
<span class="nc">String</span><span class="o">[]</span> <span class="n">attributes</span> <span class="o">=</span> <span class="n">record</span><span class="o">.</span><span class="na">split</span><span class="o">(</span><span class="s">","</span><span class="o">);</span>
<span class="k">return</span> <span class="nc">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="n">attributes</span><span class="o">[</span><span class="mi">0</span><span class="o">],</span> <span class="n">attributes</span><span class="o">[</span><span class="mi">1</span><span class="o">].</span><span class="na">trim</span><span class="o">());</span>
<span class="o">});</span>
<span class="c1">// Apply the schema to the RDD</span>
<span class="nc">Dataset</span><span class="o">&lt;</span><span class="nc">Row</span><span class="o">&gt;</span> <span class="n">peopleDataFrame</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="na">createDataFrame</span><span class="o">(</span><span class="n">rowRDD</span><span class="o">,</span> <span class="n">schema</span><span class="o">);</span>
<span class="c1">// Creates a temporary view using the DataFrame</span>
<span class="n">peopleDataFrame</span><span class="o">.</span><span class="na">createOrReplaceTempView</span><span class="o">(</span><span class="s">"people"</span><span class="o">);</span>
<span class="c1">// SQL can be run over a temporary view created using DataFrames</span>
<span class="nc">Dataset</span><span class="o">&lt;</span><span class="nc">Row</span><span class="o">&gt;</span> <span class="n">results</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="na">sql</span><span class="o">(</span><span class="s">"SELECT name FROM people"</span><span class="o">);</span>
<span class="c1">// The results of SQL queries are DataFrames and support all the normal RDD operations</span>
<span class="c1">// The columns of a row in the result can be accessed by field index or by field name</span>
<span class="nc">Dataset</span><span class="o">&lt;</span><span class="nc">String</span><span class="o">&gt;</span> <span class="n">namesDS</span> <span class="o">=</span> <span class="n">results</span><span class="o">.</span><span class="na">map</span><span class="o">(</span>
<span class="o">(</span><span class="nc">MapFunction</span><span class="o">&lt;</span><span class="nc">Row</span><span class="o">,</span> <span class="nc">String</span><span class="o">&gt;)</span> <span class="n">row</span> <span class="o">-&gt;</span> <span class="s">"Name: "</span> <span class="o">+</span> <span class="n">row</span><span class="o">.</span><span class="na">getString</span><span class="o">(</span><span class="mi">0</span><span class="o">),</span>
<span class="nc">Encoders</span><span class="o">.</span><span class="na">STRING</span><span class="o">());</span>
<span class="n">namesDS</span><span class="o">.</span><span class="na">show</span><span class="o">();</span>
<span class="c1">// +-------------+</span>
<span class="c1">// | value|</span>
<span class="c1">// +-------------+</span>
<span class="c1">// |Name: Michael|</span>
<span class="c1">// | Name: Andy|</span>
<span class="c1">// | Name: Justin|</span>
<span class="c1">// +-------------+</span></code></pre></div>
<div><small>Find full example code at "examples/src/main/java/org/apache/spark/examples/sql/JavaSparkSQLExample.java" in the Spark repo.</small></div>
</div>
<div data-lang="python">
<p>When a dictionary of kwargs cannot be defined ahead of time (for example,
the structure of records is encoded in a string, or a text dataset will be parsed and
fields will be projected differently for different users),
a <code class="language-plaintext highlighter-rouge">DataFrame</code> can be created programmatically with three steps.</p>
<ol>
<li>Create an RDD of tuples or lists from the original RDD;</li>
<li>Create the schema represented by a <code class="language-plaintext highlighter-rouge">StructType</code> matching the structure of
tuples or lists in the RDD created in step 1;</li>
<li>Apply the schema to the RDD via the <code class="language-plaintext highlighter-rouge">createDataFrame</code> method provided by <code class="language-plaintext highlighter-rouge">SparkSession</code>.</li>
</ol>
<p>For example:</p>
<div class="highlight"><pre class="codehilite"><code><span class="c1"># Import data types
</span><span class="kn">from</span> <span class="nn">pyspark.sql.types</span> <span class="kn">import</span> <span class="n">StringType</span><span class="p">,</span> <span class="n">StructType</span><span class="p">,</span> <span class="n">StructField</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">spark</span><span class="p">.</span><span class="n">sparkContext</span>
<span class="c1"># Load a text file and convert each line to a Row.
</span><span class="n">lines</span> <span class="o">=</span> <span class="n">sc</span><span class="p">.</span><span class="n">textFile</span><span class="p">(</span><span class="s">"examples/src/main/resources/people.txt"</span><span class="p">)</span>
<span class="n">parts</span> <span class="o">=</span> <span class="n">lines</span><span class="p">.</span><span class="nb">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">l</span><span class="p">:</span> <span class="n">l</span><span class="p">.</span><span class="n">split</span><span class="p">(</span><span class="s">","</span><span class="p">))</span>
<span class="c1"># Each line is converted to a tuple.
</span><span class="n">people</span> <span class="o">=</span> <span class="n">parts</span><span class="p">.</span><span class="nb">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">p</span><span class="p">:</span> <span class="p">(</span><span class="n">p</span><span class="p">[</span><span class="mi">0</span><span class="p">],</span> <span class="n">p</span><span class="p">[</span><span class="mi">1</span><span class="p">].</span><span class="n">strip</span><span class="p">()))</span>
<span class="c1"># The schema is encoded in a string.
</span><span class="n">schemaString</span> <span class="o">=</span> <span class="s">"name age"</span>
<span class="n">fields</span> <span class="o">=</span> <span class="p">[</span><span class="n">StructField</span><span class="p">(</span><span class="n">field_name</span><span class="p">,</span> <span class="n">StringType</span><span class="p">(),</span> <span class="bp">True</span><span class="p">)</span> <span class="k">for</span> <span class="n">field_name</span> <span class="ow">in</span> <span class="n">schemaString</span><span class="p">.</span><span class="n">split</span><span class="p">()]</span>
<span class="n">schema</span> <span class="o">=</span> <span class="n">StructType</span><span class="p">(</span><span class="n">fields</span><span class="p">)</span>
<span class="c1"># Apply the schema to the RDD.
</span><span class="n">schemaPeople</span> <span class="o">=</span> <span class="n">spark</span><span class="p">.</span><span class="n">createDataFrame</span><span class="p">(</span><span class="n">people</span><span class="p">,</span> <span class="n">schema</span><span class="p">)</span>
<span class="c1"># Creates a temporary view using the DataFrame
</span><span class="n">schemaPeople</span><span class="p">.</span><span class="n">createOrReplaceTempView</span><span class="p">(</span><span class="s">"people"</span><span class="p">)</span>
<span class="c1"># SQL can be run over DataFrames that have been registered as a table.
</span><span class="n">results</span> <span class="o">=</span> <span class="n">spark</span><span class="p">.</span><span class="n">sql</span><span class="p">(</span><span class="s">"SELECT name FROM people"</span><span class="p">)</span>
<span class="n">results</span><span class="p">.</span><span class="n">show</span><span class="p">()</span>
<span class="c1"># +-------+
# | name|
# +-------+
# |Michael|
# | Andy|
# | Justin|
# +-------+</span></code></pre></div>
<div><small>Find full example code at "examples/src/main/python/sql/basic.py" in the Spark repo.</small></div>
</div>
</div>
<h2 id="scalar-functions">Scalar Functions</h2>
<p>Scalar functions are functions that return a single value per row, as opposed to aggregation functions, which return a value for a group of rows. Spark SQL supports a variety of <a href="sql-ref-functions.html#scalar-functions">Built-in Scalar Functions</a>. It also supports <a href="sql-ref-functions-udf-scalar.html">User Defined Scalar Functions</a>.</p>
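<p>For instance, a user-defined scalar function can be registered once and then used from both the DataFrame API and SQL. The sketch below is illustrative rather than part of the original guide: it reuses the <code class="language-plaintext highlighter-rouge">people</code> temporary view registered in the examples above, and the <code class="language-plaintext highlighter-rouge">upper_name</code> function name is an assumption, not a built-in:</p>
<div class="highlight"><pre class="codehilite"><code>import org.apache.spark.sql.functions.udf
import spark.implicits._

// A scalar function returns one output value per input row
// (assumes non-null names, as in people.txt)
val upperUdf = udf((s: String) =&gt; s.toUpperCase)
spark.udf.register("upper_name", (s: String) =&gt; s.toUpperCase)

spark.table("people").select(upperUdf($"name")).show()
spark.sql("SELECT upper_name(name) FROM people").show()</code></pre></div>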
<h2 id="aggregate-functions">Aggregate Functions</h2>
<p>Aggregate functions are functions that return a single value for a group of rows. The <a href="sql-ref-functions-builtin.html#aggregate-functions">Built-in Aggregation Functions</a> provide common aggregations such as <code class="language-plaintext highlighter-rouge">count()</code>, <code class="language-plaintext highlighter-rouge">count_distinct()</code>, <code class="language-plaintext highlighter-rouge">avg()</code>, <code class="language-plaintext highlighter-rouge">max()</code>, <code class="language-plaintext highlighter-rouge">min()</code>, etc.
Users are not limited to the predefined aggregate functions and can create their own; for details,
see the documentation of
<a href="sql-ref-functions-udf-aggregate.html">User Defined Aggregate Functions</a>.</p>
</div>
<!-- /container -->
</div>
<script src="js/vendor/jquery-3.5.1.min.js"></script>
<script src="js/vendor/bootstrap.bundle.min.js"></script>
<script src="js/vendor/anchor.min.js"></script>
<script src="js/main.js"></script>
<script type="text/javascript" src="https://cdn.jsdelivr.net/npm/docsearch.js@2/dist/cdn/docsearch.min.js"></script>
<script type="text/javascript">
// DocSearch is entirely free and automated. DocSearch is built in two parts:
// 1. a crawler which we run on our own infrastructure every 24 hours. It follows every link
// in your website and extracts content from every page it traverses. It then pushes this
// content to an Algolia index.
// 2. a JavaScript snippet to be inserted in your website that will bind this Algolia index
// to your search input and display its results in a dropdown UI. If you want more
// details on how DocSearch works, check the DocSearch docs.
docsearch({
apiKey: 'b18ca3732c502995563043aa17bc6ecb',
indexName: 'apache_spark',
inputSelector: '#docsearch-input',
enhancedSearchInput: true,
algoliaOptions: {
'facetFilters': ["version:3.2.0"]
},
debug: false // Set debug to true if you want to inspect the dropdown
});
</script>
<!-- MathJax Section -->
<script type="text/x-mathjax-config">
MathJax.Hub.Config({
TeX: { equationNumbers: { autoNumber: "AMS" } }
});
</script>
<script>
// Note that we load MathJax this way to work with local file (file://), HTTP and HTTPS.
// We could use "//cdn.mathjax...", but that won't support "file://".
(function(d, script) {
script = d.createElement('script');
script.type = 'text/javascript';
script.async = true;
script.onload = function(){
MathJax.Hub.Config({
tex2jax: {
inlineMath: [ ["$", "$"], ["\\\\(","\\\\)"] ],
displayMath: [ ["$$","$$"], ["\\[", "\\]"] ],
processEscapes: true,
skipTags: ['script', 'noscript', 'style', 'textarea', 'pre']
}
});
};
script.src = ('https:' == document.location.protocol ? 'https://' : 'http://') +
'cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.1/MathJax.js' +
'?config=TeX-AMS-MML_HTMLorMML';
d.getElementsByTagName('head')[0].appendChild(script);
}(document));
</script>
</body>
</html>