blob: 6f133a7a376c6f7bd9bca11b521baad2e8c9cabf [file] [log] [blame]
<!DOCTYPE html>
<!--[if lt IE 7]> <html class="no-js lt-ie9 lt-ie8 lt-ie7"> <![endif]-->
<!--[if IE 7]> <html class="no-js lt-ie9 lt-ie8"> <![endif]-->
<!--[if IE 8]> <html class="no-js lt-ie9"> <![endif]-->
<!--[if gt IE 8]><!--> <html class="no-js"> <!--<![endif]-->
<head>
<meta charset="utf-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Generic Load/Save Functions - Spark 3.5.5 Documentation</title>
<link rel="stylesheet" href="css/bootstrap.min.css">
<!-- Preconnect to the font origins; fonts.gstatic.com serves font files via CORS,
     so its preconnect needs the crossorigin attribute to be reusable. -->
<link rel="preconnect" href="https://fonts.googleapis.com">
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
<!-- The Google Fonts CSS2 API requires a separate "family=" query parameter per
     family. The previous URL had "&Courier+Prime:wght@400;700" without the
     "family=" prefix, so that parameter was ignored and Courier Prime was
     never actually loaded. -->
<link href="https://fonts.googleapis.com/css2?family=DM+Sans:ital,wght@0,400;0,500;0,700;1,400;1,500;1,700&family=Courier+Prime:wght@400;700&display=swap" rel="stylesheet">
<link href="css/custom.css" rel="stylesheet">
<!-- Modernizr + respond.js shim for legacy-IE feature detection; must load early. -->
<script src="js/vendor/modernizr-2.6.1-respond-1.1.0.min.js"></script>
<link rel="stylesheet" href="css/pygments-default.css">
<link rel="stylesheet" href="css/docsearch.min.css">
<link rel="stylesheet" href="css/docsearch.css">
<!-- Matomo web analytics: cookie-less page-view and outbound-link tracking. -->
<script>
var _paq = window._paq = window._paq || [];
/* tracker methods like "setCustomDimension" should be called before "trackPageView" */
_paq.push(["disableCookies"]);
_paq.push(['trackPageView']);
_paq.push(['enableLinkTracking']);
(function() {
  var u="https://analytics.apache.org/";
  _paq.push(['setTrackerUrl', u+'matomo.php']);
  _paq.push(['setSiteId', '40']);
  // Inject the Matomo loader asynchronously before the first existing <script>.
  var d=document, g=d.createElement('script'), s=d.getElementsByTagName('script')[0];
  g.async=true; g.src=u+'matomo.js'; s.parentNode.insertBefore(g,s);
})();
</script>
<!-- End Matomo Code -->
</head>
<body class="global">
<!--[if lt IE 7]>
<p class="chromeframe">You are using an outdated browser. <a href="https://browsehappy.com/">Upgrade your browser today</a> or <a href="http://www.google.com/chromeframe/?redirect=true">install Google Chrome Frame</a> to better experience this site.</p>
<![endif]-->
<!-- This code is taken from http://twitter.github.com/bootstrap/examples/hero.html -->
<!-- Fixed top navigation bar (Bootstrap navbar with collapsible menu). -->
<nav class="navbar navbar-expand-lg navbar-dark p-0 px-4 fixed-top" style="background: #1d6890;" id="topbar">
  <div class="navbar-brand"><a href="index.html">
    <!-- The logo is the link's only content, so alt must name the destination.
         Explicit width/height reserve layout space; void elements take no "/". -->
    <img src="img/spark-logo-rev.svg" width="141" height="72" alt="Apache Spark"></a><span class="version">3.5.5</span>
  </div>
  <button class="navbar-toggler" type="button" data-toggle="collapse"
          data-target="#navbarCollapse" aria-controls="navbarCollapse"
          aria-expanded="false" aria-label="Toggle navigation">
    <span class="navbar-toggler-icon"></span>
  </button>
  <div class="collapse navbar-collapse" id="navbarCollapse">
    <ul class="navbar-nav me-auto">
      <li class="nav-item"><a href="index.html" class="nav-link">Overview</a></li>
      <li class="nav-item dropdown">
        <a href="#" class="nav-link dropdown-toggle" id="navbarQuickStart" role="button" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false">Programming Guides</a>
        <div class="dropdown-menu" aria-labelledby="navbarQuickStart">
          <a class="dropdown-item" href="quick-start.html">Quick Start</a>
          <a class="dropdown-item" href="rdd-programming-guide.html">RDDs, Accumulators, Broadcasts Vars</a>
          <a class="dropdown-item" href="sql-programming-guide.html">SQL, DataFrames, and Datasets</a>
          <a class="dropdown-item" href="structured-streaming-programming-guide.html">Structured Streaming</a>
          <a class="dropdown-item" href="streaming-programming-guide.html">Spark Streaming (DStreams)</a>
          <a class="dropdown-item" href="ml-guide.html">MLlib (Machine Learning)</a>
          <a class="dropdown-item" href="graphx-programming-guide.html">GraphX (Graph Processing)</a>
          <a class="dropdown-item" href="sparkr.html">SparkR (R on Spark)</a>
          <a class="dropdown-item" href="api/python/getting_started/index.html">PySpark (Python on Spark)</a>
        </div>
      </li>
      <li class="nav-item dropdown">
        <a href="#" class="nav-link dropdown-toggle" id="navbarAPIDocs" role="button" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false">API Docs</a>
        <div class="dropdown-menu" aria-labelledby="navbarAPIDocs">
          <a class="dropdown-item" href="api/scala/org/apache/spark/index.html">Scala</a>
          <a class="dropdown-item" href="api/java/index.html">Java</a>
          <a class="dropdown-item" href="api/python/index.html">Python</a>
          <a class="dropdown-item" href="api/R/index.html">R</a>
          <a class="dropdown-item" href="api/sql/index.html">SQL, Built-in Functions</a>
        </div>
      </li>
      <li class="nav-item dropdown">
        <a href="#" class="nav-link dropdown-toggle" id="navbarDeploying" role="button" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false">Deploying</a>
        <div class="dropdown-menu" aria-labelledby="navbarDeploying">
          <a class="dropdown-item" href="cluster-overview.html">Overview</a>
          <a class="dropdown-item" href="submitting-applications.html">Submitting Applications</a>
          <div class="dropdown-divider"></div>
          <a class="dropdown-item" href="spark-standalone.html">Spark Standalone</a>
          <a class="dropdown-item" href="running-on-mesos.html">Mesos</a>
          <a class="dropdown-item" href="running-on-yarn.html">YARN</a>
          <a class="dropdown-item" href="running-on-kubernetes.html">Kubernetes</a>
        </div>
      </li>
      <li class="nav-item dropdown">
        <a href="#" class="nav-link dropdown-toggle" id="navbarMore" role="button" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false">More</a>
        <div class="dropdown-menu" aria-labelledby="navbarMore">
          <a class="dropdown-item" href="configuration.html">Configuration</a>
          <a class="dropdown-item" href="monitoring.html">Monitoring</a>
          <a class="dropdown-item" href="tuning.html">Tuning Guide</a>
          <a class="dropdown-item" href="job-scheduling.html">Job Scheduling</a>
          <a class="dropdown-item" href="security.html">Security</a>
          <a class="dropdown-item" href="hardware-provisioning.html">Hardware Provisioning</a>
          <a class="dropdown-item" href="migration-guide.html">Migration Guide</a>
          <div class="dropdown-divider"></div>
          <a class="dropdown-item" href="building-spark.html">Building Spark</a>
          <a class="dropdown-item" href="https://spark.apache.org/contributing.html">Contributing to Spark</a>
          <a class="dropdown-item" href="https://spark.apache.org/third-party-projects.html">Third Party Projects</a>
        </div>
      </li>
      <li class="nav-item">
        <input type="text" id="docsearch-input" placeholder="Search the docs…">
      </li>
    </ul>
    <!--<span class="navbar-text navbar-right"><span class="version-text">v3.5.5</span></span>-->
  </div>
</nav>
<div class="container">
<!-- Left sidebar: Spark SQL Guide table of contents. -->
<div class="left-menu-wrapper">
  <div class="left-menu">
    <h3><a href="sql-programming-guide.html">Spark SQL Guide</a></h3>
    <ul>
      <li>
        <a href="sql-getting-started.html">Getting Started</a>
      </li>
      <li>
        <a href="sql-data-sources.html">Data Sources</a>
        <!-- A nested list must be a child of its parent <li>; previously this
             <ul> was a direct child of the outer <ul>, which is invalid HTML
             (a <ul> may only contain <li>, <script>, and <template>). -->
        <ul>
          <li>
            <a href="sql-data-sources-load-save-functions.html">Generic Load/Save Functions</a>
          </li>
          <li>
            <a href="sql-data-sources-generic-options.html">Generic File Source Options</a>
          </li>
          <li>
            <a href="sql-data-sources-parquet.html">Parquet Files</a>
          </li>
          <li>
            <a href="sql-data-sources-orc.html">ORC Files</a>
          </li>
          <li>
            <a href="sql-data-sources-json.html">JSON Files</a>
          </li>
          <li>
            <a href="sql-data-sources-csv.html">CSV Files</a>
          </li>
          <li>
            <a href="sql-data-sources-text.html">Text Files</a>
          </li>
          <li>
            <a href="sql-data-sources-hive-tables.html">Hive Tables</a>
          </li>
          <li>
            <a href="sql-data-sources-jdbc.html">JDBC To Other Databases</a>
          </li>
          <li>
            <a href="sql-data-sources-avro.html">Avro Files</a>
          </li>
          <li>
            <a href="sql-data-sources-protobuf.html">Protobuf data</a>
          </li>
          <li>
            <a href="sql-data-sources-binaryFile.html">Whole Binary Files</a>
          </li>
          <li>
            <a href="sql-data-sources-troubleshooting.html">Troubleshooting</a>
          </li>
        </ul>
      </li>
      <li>
        <a href="sql-performance-tuning.html">Performance Tuning</a>
      </li>
      <li>
        <a href="sql-distributed-sql-engine.html">Distributed SQL Engine</a>
      </li>
      <li>
        <a href="sql-pyspark-pandas-with-arrow.html">PySpark Usage Guide for Pandas with Apache Arrow</a>
      </li>
      <li>
        <a href="sql-migration-guide.html">Migration Guide</a>
      </li>
      <li>
        <a href="sql-ref.html">SQL Reference</a>
      </li>
      <li>
        <a href="sql-error-conditions.html">Error Conditions</a>
      </li>
    </ul>
  </div>
</div>
<!-- CSS-only "checkbox hack": the stylesheet shows/hides the left sidebar based
     on this checkbox's :checked state, toggled by clicking the paired label.
     "checked" means the menu starts open.
     NOTE(review): the <label> is intentionally empty (styled as a toggle handle
     by CSS), so it exposes no accessible name to assistive technology —
     consider adding an aria-label; confirm against css/custom.css. -->
<input id="nav-trigger" class="nav-trigger" checked type="checkbox">
<label for="nav-trigger"></label>
<div class="content-with-sidebar mr-3" id="content">
<h1 class="title">Generic Load/Save Functions</h1>
<ul id="markdown-toc">
<li><a href="#manually-specifying-options" id="markdown-toc-manually-specifying-options">Manually Specifying Options</a></li>
<li><a href="#run-sql-on-files-directly" id="markdown-toc-run-sql-on-files-directly">Run SQL on files directly</a></li>
<li><a href="#save-modes" id="markdown-toc-save-modes">Save Modes</a></li>
<li><a href="#saving-to-persistent-tables" id="markdown-toc-saving-to-persistent-tables">Saving to Persistent Tables</a></li>
<li><a href="#bucketing-sorting-and-partitioning" id="markdown-toc-bucketing-sorting-and-partitioning">Bucketing, Sorting and Partitioning</a></li>
</ul>
<p>In the simplest form, the default data source (<code class="language-plaintext highlighter-rouge">parquet</code> unless otherwise configured by
<code class="language-plaintext highlighter-rouge">spark.sql.sources.default</code>) will be used for all operations.</p>
<div class="codetabs">
<div data-lang="python">
<div class="highlight"><pre class="codehilite"><code><span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="p">.</span><span class="n">read</span><span class="p">.</span><span class="n">load</span><span class="p">(</span><span class="s">"examples/src/main/resources/users.parquet"</span><span class="p">)</span>
<span class="n">df</span><span class="p">.</span><span class="n">select</span><span class="p">(</span><span class="s">"name"</span><span class="p">,</span> <span class="s">"favorite_color"</span><span class="p">).</span><span class="n">write</span><span class="p">.</span><span class="n">save</span><span class="p">(</span><span class="s">"namesAndFavColors.parquet"</span><span class="p">)</span></code></pre></div>
<div><small>Find full example code at "examples/src/main/python/sql/datasource.py" in the Spark repo.</small></div>
</div>
<div data-lang="scala">
<div class="highlight"><pre class="codehilite"><code><span class="k">val</span> <span class="nv">usersDF</span> <span class="k">=</span> <span class="nv">spark</span><span class="o">.</span><span class="py">read</span><span class="o">.</span><span class="py">load</span><span class="o">(</span><span class="s">"examples/src/main/resources/users.parquet"</span><span class="o">)</span>
<span class="nv">usersDF</span><span class="o">.</span><span class="py">select</span><span class="o">(</span><span class="s">"name"</span><span class="o">,</span> <span class="s">"favorite_color"</span><span class="o">).</span><span class="py">write</span><span class="o">.</span><span class="py">save</span><span class="o">(</span><span class="s">"namesAndFavColors.parquet"</span><span class="o">)</span></code></pre></div>
<div><small>Find full example code at "examples/src/main/scala/org/apache/spark/examples/sql/SQLDataSourceExample.scala" in the Spark repo.</small></div>
</div>
<div data-lang="java">
<div class="highlight"><pre class="codehilite"><code><span class="nc">Dataset</span><span class="o">&lt;</span><span class="nc">Row</span><span class="o">&gt;</span> <span class="n">usersDF</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="na">read</span><span class="o">().</span><span class="na">load</span><span class="o">(</span><span class="s">"examples/src/main/resources/users.parquet"</span><span class="o">);</span>
<span class="n">usersDF</span><span class="o">.</span><span class="na">select</span><span class="o">(</span><span class="s">"name"</span><span class="o">,</span> <span class="s">"favorite_color"</span><span class="o">).</span><span class="na">write</span><span class="o">().</span><span class="na">save</span><span class="o">(</span><span class="s">"namesAndFavColors.parquet"</span><span class="o">);</span></code></pre></div>
<div><small>Find full example code at "examples/src/main/java/org/apache/spark/examples/sql/JavaSQLDataSourceExample.java" in the Spark repo.</small></div>
</div>
<div data-lang="r">
<div class="highlight"><pre class="codehilite"><code><span class="n">df</span><span class="w"> </span><span class="o">&lt;-</span><span class="w"> </span><span class="n">read.df</span><span class="p">(</span><span class="s2">"examples/src/main/resources/users.parquet"</span><span class="p">)</span><span class="w">
</span><span class="n">write.df</span><span class="p">(</span><span class="n">select</span><span class="p">(</span><span class="n">df</span><span class="p">,</span><span class="w"> </span><span class="s2">"name"</span><span class="p">,</span><span class="w"> </span><span class="s2">"favorite_color"</span><span class="p">),</span><span class="w"> </span><span class="s2">"namesAndFavColors.parquet"</span><span class="p">)</span></code></pre></div>
<div><small>Find full example code at "examples/src/main/r/RSparkSQLExample.R" in the Spark repo.</small></div>
</div>
</div>
<h3 id="manually-specifying-options">Manually Specifying Options</h3>
<p>You can also manually specify the data source that will be used along with any extra options
that you would like to pass to the data source. Data sources are specified by their fully qualified
name (i.e., <code class="language-plaintext highlighter-rouge">org.apache.spark.sql.parquet</code>), but for built-in sources you can also use their short
names (<code class="language-plaintext highlighter-rouge">json</code>, <code class="language-plaintext highlighter-rouge">parquet</code>, <code class="language-plaintext highlighter-rouge">jdbc</code>, <code class="language-plaintext highlighter-rouge">orc</code>, <code class="language-plaintext highlighter-rouge">libsvm</code>, <code class="language-plaintext highlighter-rouge">csv</code>, <code class="language-plaintext highlighter-rouge">text</code>). DataFrames loaded from any data
source type can be converted into other types using this syntax.</p>
<p>Please refer to the API documentation for available options of built-in sources, for example,
<code class="language-plaintext highlighter-rouge">org.apache.spark.sql.DataFrameReader</code> and <code class="language-plaintext highlighter-rouge">org.apache.spark.sql.DataFrameWriter</code>. The
options documented there should be applicable through non-Scala Spark APIs (e.g. PySpark)
as well. For other formats, refer to the API documentation of the particular format.</p>
<p>To load a JSON file you can use:</p>
<div class="codetabs">
<div data-lang="python">
<div class="highlight"><pre class="codehilite"><code><span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="p">.</span><span class="n">read</span><span class="p">.</span><span class="n">load</span><span class="p">(</span><span class="s">"examples/src/main/resources/people.json"</span><span class="p">,</span> <span class="nb">format</span><span class="o">=</span><span class="s">"json"</span><span class="p">)</span>
<span class="n">df</span><span class="p">.</span><span class="n">select</span><span class="p">(</span><span class="s">"name"</span><span class="p">,</span> <span class="s">"age"</span><span class="p">).</span><span class="n">write</span><span class="p">.</span><span class="n">save</span><span class="p">(</span><span class="s">"namesAndAges.parquet"</span><span class="p">,</span> <span class="nb">format</span><span class="o">=</span><span class="s">"parquet"</span><span class="p">)</span></code></pre></div>
<div><small>Find full example code at "examples/src/main/python/sql/datasource.py" in the Spark repo.</small></div>
</div>
<div data-lang="scala">
<div class="highlight"><pre class="codehilite"><code><span class="k">val</span> <span class="nv">peopleDF</span> <span class="k">=</span> <span class="nv">spark</span><span class="o">.</span><span class="py">read</span><span class="o">.</span><span class="py">format</span><span class="o">(</span><span class="s">"json"</span><span class="o">).</span><span class="py">load</span><span class="o">(</span><span class="s">"examples/src/main/resources/people.json"</span><span class="o">)</span>
<span class="nv">peopleDF</span><span class="o">.</span><span class="py">select</span><span class="o">(</span><span class="s">"name"</span><span class="o">,</span> <span class="s">"age"</span><span class="o">).</span><span class="py">write</span><span class="o">.</span><span class="py">format</span><span class="o">(</span><span class="s">"parquet"</span><span class="o">).</span><span class="py">save</span><span class="o">(</span><span class="s">"namesAndAges.parquet"</span><span class="o">)</span></code></pre></div>
<div><small>Find full example code at "examples/src/main/scala/org/apache/spark/examples/sql/SQLDataSourceExample.scala" in the Spark repo.</small></div>
</div>
<div data-lang="java">
<div class="highlight"><pre class="codehilite"><code><span class="nc">Dataset</span><span class="o">&lt;</span><span class="nc">Row</span><span class="o">&gt;</span> <span class="n">peopleDF</span> <span class="o">=</span>
<span class="n">spark</span><span class="o">.</span><span class="na">read</span><span class="o">().</span><span class="na">format</span><span class="o">(</span><span class="s">"json"</span><span class="o">).</span><span class="na">load</span><span class="o">(</span><span class="s">"examples/src/main/resources/people.json"</span><span class="o">);</span>
<span class="n">peopleDF</span><span class="o">.</span><span class="na">select</span><span class="o">(</span><span class="s">"name"</span><span class="o">,</span> <span class="s">"age"</span><span class="o">).</span><span class="na">write</span><span class="o">().</span><span class="na">format</span><span class="o">(</span><span class="s">"parquet"</span><span class="o">).</span><span class="na">save</span><span class="o">(</span><span class="s">"namesAndAges.parquet"</span><span class="o">);</span></code></pre></div>
<div><small>Find full example code at "examples/src/main/java/org/apache/spark/examples/sql/JavaSQLDataSourceExample.java" in the Spark repo.</small></div>
</div>
<div data-lang="r">
<div class="highlight"><pre class="codehilite"><code><span class="n">df</span><span class="w"> </span><span class="o">&lt;-</span><span class="w"> </span><span class="n">read.df</span><span class="p">(</span><span class="s2">"examples/src/main/resources/people.json"</span><span class="p">,</span><span class="w"> </span><span class="s2">"json"</span><span class="p">)</span><span class="w">
</span><span class="n">namesAndAges</span><span class="w"> </span><span class="o">&lt;-</span><span class="w"> </span><span class="n">select</span><span class="p">(</span><span class="n">df</span><span class="p">,</span><span class="w"> </span><span class="s2">"name"</span><span class="p">,</span><span class="w"> </span><span class="s2">"age"</span><span class="p">)</span><span class="w">
</span><span class="n">write.df</span><span class="p">(</span><span class="n">namesAndAges</span><span class="p">,</span><span class="w"> </span><span class="s2">"namesAndAges.parquet"</span><span class="p">,</span><span class="w"> </span><span class="s2">"parquet"</span><span class="p">)</span></code></pre></div>
<div><small>Find full example code at "examples/src/main/r/RSparkSQLExample.R" in the Spark repo.</small></div>
</div>
</div>
<p>To load a CSV file you can use:</p>
<div class="codetabs">
<div data-lang="python">
<div class="highlight"><pre class="codehilite"><code><span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="p">.</span><span class="n">read</span><span class="p">.</span><span class="n">load</span><span class="p">(</span><span class="s">"examples/src/main/resources/people.csv"</span><span class="p">,</span>
<span class="nb">format</span><span class="o">=</span><span class="s">"csv"</span><span class="p">,</span> <span class="n">sep</span><span class="o">=</span><span class="s">";"</span><span class="p">,</span> <span class="n">inferSchema</span><span class="o">=</span><span class="s">"true"</span><span class="p">,</span> <span class="n">header</span><span class="o">=</span><span class="s">"true"</span><span class="p">)</span></code></pre></div>
<div><small>Find full example code at "examples/src/main/python/sql/datasource.py" in the Spark repo.</small></div>
</div>
<div data-lang="scala">
<div class="highlight"><pre class="codehilite"><code><span class="k">val</span> <span class="nv">peopleDFCsv</span> <span class="k">=</span> <span class="nv">spark</span><span class="o">.</span><span class="py">read</span><span class="o">.</span><span class="py">format</span><span class="o">(</span><span class="s">"csv"</span><span class="o">)</span>
<span class="o">.</span><span class="py">option</span><span class="o">(</span><span class="s">"sep"</span><span class="o">,</span> <span class="s">";"</span><span class="o">)</span>
<span class="o">.</span><span class="py">option</span><span class="o">(</span><span class="s">"inferSchema"</span><span class="o">,</span> <span class="s">"true"</span><span class="o">)</span>
<span class="o">.</span><span class="py">option</span><span class="o">(</span><span class="s">"header"</span><span class="o">,</span> <span class="s">"true"</span><span class="o">)</span>
<span class="o">.</span><span class="py">load</span><span class="o">(</span><span class="s">"examples/src/main/resources/people.csv"</span><span class="o">)</span></code></pre></div>
<div><small>Find full example code at "examples/src/main/scala/org/apache/spark/examples/sql/SQLDataSourceExample.scala" in the Spark repo.</small></div>
</div>
<div data-lang="java">
<div class="highlight"><pre class="codehilite"><code><span class="nc">Dataset</span><span class="o">&lt;</span><span class="nc">Row</span><span class="o">&gt;</span> <span class="n">peopleDFCsv</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="na">read</span><span class="o">().</span><span class="na">format</span><span class="o">(</span><span class="s">"csv"</span><span class="o">)</span>
<span class="o">.</span><span class="na">option</span><span class="o">(</span><span class="s">"sep"</span><span class="o">,</span> <span class="s">";"</span><span class="o">)</span>
<span class="o">.</span><span class="na">option</span><span class="o">(</span><span class="s">"inferSchema"</span><span class="o">,</span> <span class="s">"true"</span><span class="o">)</span>
<span class="o">.</span><span class="na">option</span><span class="o">(</span><span class="s">"header"</span><span class="o">,</span> <span class="s">"true"</span><span class="o">)</span>
<span class="o">.</span><span class="na">load</span><span class="o">(</span><span class="s">"examples/src/main/resources/people.csv"</span><span class="o">);</span></code></pre></div>
<div><small>Find full example code at "examples/src/main/java/org/apache/spark/examples/sql/JavaSQLDataSourceExample.java" in the Spark repo.</small></div>
</div>
<div data-lang="r">
<div class="highlight"><pre class="codehilite"><code><span class="n">df</span><span class="w"> </span><span class="o">&lt;-</span><span class="w"> </span><span class="n">read.df</span><span class="p">(</span><span class="s2">"examples/src/main/resources/people.csv"</span><span class="p">,</span><span class="w"> </span><span class="s2">"csv"</span><span class="p">,</span><span class="w"> </span><span class="n">sep</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="s2">";"</span><span class="p">,</span><span class="w"> </span><span class="n">inferSchema</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="kc">TRUE</span><span class="p">,</span><span class="w"> </span><span class="n">header</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="kc">TRUE</span><span class="p">)</span><span class="w">
</span><span class="n">namesAndAges</span><span class="w"> </span><span class="o">&lt;-</span><span class="w"> </span><span class="n">select</span><span class="p">(</span><span class="n">df</span><span class="p">,</span><span class="w"> </span><span class="s2">"name"</span><span class="p">,</span><span class="w"> </span><span class="s2">"age"</span><span class="p">)</span></code></pre></div>
<div><small>Find full example code at "examples/src/main/r/RSparkSQLExample.R" in the Spark repo.</small></div>
</div>
</div>
<p>The extra options are also used during write operation.
For example, you can control bloom filters and dictionary encodings for ORC data sources.
The following ORC example will create bloom filter and use dictionary encoding only for <code class="language-plaintext highlighter-rouge">favorite_color</code>.
For Parquet, there exists <code class="language-plaintext highlighter-rouge">parquet.bloom.filter.enabled</code> and <code class="language-plaintext highlighter-rouge">parquet.enable.dictionary</code>, too.
To find more detailed information about the extra ORC/Parquet options,
visit the official Apache <a href="https://orc.apache.org/docs/spark-config.html">ORC</a> / <a href="https://github.com/apache/parquet-mr/tree/master/parquet-hadoop">Parquet</a> websites.</p>
<p>ORC data source:</p>
<div class="codetabs">
<div data-lang="python">
<div class="highlight"><pre class="codehilite"><code><span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="p">.</span><span class="n">read</span><span class="p">.</span><span class="n">orc</span><span class="p">(</span><span class="s">"examples/src/main/resources/users.orc"</span><span class="p">)</span>
<span class="p">(</span><span class="n">df</span><span class="p">.</span><span class="n">write</span><span class="p">.</span><span class="nb">format</span><span class="p">(</span><span class="s">"orc"</span><span class="p">)</span>
<span class="p">.</span><span class="n">option</span><span class="p">(</span><span class="s">"orc.bloom.filter.columns"</span><span class="p">,</span> <span class="s">"favorite_color"</span><span class="p">)</span>
<span class="p">.</span><span class="n">option</span><span class="p">(</span><span class="s">"orc.dictionary.key.threshold"</span><span class="p">,</span> <span class="s">"1.0"</span><span class="p">)</span>
<span class="p">.</span><span class="n">option</span><span class="p">(</span><span class="s">"orc.column.encoding.direct"</span><span class="p">,</span> <span class="s">"name"</span><span class="p">)</span>
<span class="p">.</span><span class="n">save</span><span class="p">(</span><span class="s">"users_with_options.orc"</span><span class="p">))</span></code></pre></div>
<div><small>Find full example code at "examples/src/main/python/sql/datasource.py" in the Spark repo.</small></div>
</div>
<div data-lang="scala">
<div class="highlight"><pre class="codehilite"><code><span class="nv">usersDF</span><span class="o">.</span><span class="py">write</span><span class="o">.</span><span class="py">format</span><span class="o">(</span><span class="s">"orc"</span><span class="o">)</span>
<span class="o">.</span><span class="py">option</span><span class="o">(</span><span class="s">"orc.bloom.filter.columns"</span><span class="o">,</span> <span class="s">"favorite_color"</span><span class="o">)</span>
<span class="o">.</span><span class="py">option</span><span class="o">(</span><span class="s">"orc.dictionary.key.threshold"</span><span class="o">,</span> <span class="s">"1.0"</span><span class="o">)</span>
<span class="o">.</span><span class="py">option</span><span class="o">(</span><span class="s">"orc.column.encoding.direct"</span><span class="o">,</span> <span class="s">"name"</span><span class="o">)</span>
<span class="o">.</span><span class="py">save</span><span class="o">(</span><span class="s">"users_with_options.orc"</span><span class="o">)</span></code></pre></div>
<div><small>Find full example code at "examples/src/main/scala/org/apache/spark/examples/sql/SQLDataSourceExample.scala" in the Spark repo.</small></div>
</div>
<div data-lang="java">
<div class="highlight"><pre class="codehilite"><code><span class="n">usersDF</span><span class="o">.</span><span class="na">write</span><span class="o">().</span><span class="na">format</span><span class="o">(</span><span class="s">"orc"</span><span class="o">)</span>
<span class="o">.</span><span class="na">option</span><span class="o">(</span><span class="s">"orc.bloom.filter.columns"</span><span class="o">,</span> <span class="s">"favorite_color"</span><span class="o">)</span>
<span class="o">.</span><span class="na">option</span><span class="o">(</span><span class="s">"orc.dictionary.key.threshold"</span><span class="o">,</span> <span class="s">"1.0"</span><span class="o">)</span>
<span class="o">.</span><span class="na">option</span><span class="o">(</span><span class="s">"orc.column.encoding.direct"</span><span class="o">,</span> <span class="s">"name"</span><span class="o">)</span>
<span class="o">.</span><span class="na">save</span><span class="o">(</span><span class="s">"users_with_options.orc"</span><span class="o">);</span></code></pre></div>
<div><small>Find full example code at "examples/src/main/java/org/apache/spark/examples/sql/JavaSQLDataSourceExample.java" in the Spark repo.</small></div>
</div>
<div data-lang="r">
<div class="highlight"><pre class="codehilite"><code><span class="n">df</span><span class="w"> </span><span class="o">&lt;-</span><span class="w"> </span><span class="n">read.df</span><span class="p">(</span><span class="s2">"examples/src/main/resources/users.orc"</span><span class="p">,</span><span class="w"> </span><span class="s2">"orc"</span><span class="p">)</span><span class="w">
</span><span class="n">write.orc</span><span class="p">(</span><span class="n">df</span><span class="p">,</span><span class="w"> </span><span class="s2">"users_with_options.orc"</span><span class="p">,</span><span class="w"> </span><span class="n">orc.bloom.filter.columns</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="s2">"favorite_color"</span><span class="p">,</span><span class="w"> </span><span class="n">orc.dictionary.key.threshold</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="m">1.0</span><span class="p">,</span><span class="w"> </span><span class="n">orc.column.encoding.direct</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="s2">"name"</span><span class="p">)</span></code></pre></div>
<div><small>Find full example code at "examples/src/main/r/RSparkSQLExample.R" in the Spark repo.</small></div>
</div>
<div data-lang="SQL">
<figure class="highlight"><pre><code class="language-sql" data-lang="sql"><span class="k">CREATE</span> <span class="k">TABLE</span> <span class="n">users_with_options</span> <span class="p">(</span>
<span class="n">name</span> <span class="n">STRING</span><span class="p">,</span>
<span class="n">favorite_color</span> <span class="n">STRING</span><span class="p">,</span>
<span class="n">favorite_numbers</span> <span class="n">array</span><span class="o">&lt;</span><span class="nb">integer</span><span class="o">&gt;</span>
<span class="p">)</span> <span class="k">USING</span> <span class="n">ORC</span>
<span class="k">OPTIONS</span> <span class="p">(</span>
<span class="n">orc</span><span class="p">.</span><span class="n">bloom</span><span class="p">.</span><span class="n">filter</span><span class="p">.</span><span class="n">columns</span> <span class="s1">'favorite_color'</span><span class="p">,</span>
<span class="n">orc</span><span class="p">.</span><span class="k">dictionary</span><span class="p">.</span><span class="k">key</span><span class="p">.</span><span class="n">threshold</span> <span class="s1">'1.0'</span><span class="p">,</span>
<span class="n">orc</span><span class="p">.</span><span class="k">column</span><span class="p">.</span><span class="k">encoding</span><span class="p">.</span><span class="n">direct</span> <span class="s1">'name'</span>
<span class="p">)</span></code></pre></figure>
</div>
</div>
<p>Parquet data source:</p>
<div class="codetabs">
<div data-lang="python">
<div class="highlight"><pre class="codehilite"><code><span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="p">.</span><span class="n">read</span><span class="p">.</span><span class="n">parquet</span><span class="p">(</span><span class="s">"examples/src/main/resources/users.parquet"</span><span class="p">)</span>
<span class="p">(</span><span class="n">df</span><span class="p">.</span><span class="n">write</span><span class="p">.</span><span class="nb">format</span><span class="p">(</span><span class="s">"parquet"</span><span class="p">)</span>
<span class="p">.</span><span class="n">option</span><span class="p">(</span><span class="s">"parquet.bloom.filter.enabled#favorite_color"</span><span class="p">,</span> <span class="s">"true"</span><span class="p">)</span>
<span class="p">.</span><span class="n">option</span><span class="p">(</span><span class="s">"parquet.bloom.filter.expected.ndv#favorite_color"</span><span class="p">,</span> <span class="s">"1000000"</span><span class="p">)</span>
<span class="p">.</span><span class="n">option</span><span class="p">(</span><span class="s">"parquet.enable.dictionary"</span><span class="p">,</span> <span class="s">"true"</span><span class="p">)</span>
<span class="p">.</span><span class="n">option</span><span class="p">(</span><span class="s">"parquet.page.write-checksum.enabled"</span><span class="p">,</span> <span class="s">"false"</span><span class="p">)</span>
<span class="p">.</span><span class="n">save</span><span class="p">(</span><span class="s">"users_with_options.parquet"</span><span class="p">))</span></code></pre></div>
<div><small>Find full example code at "examples/src/main/python/sql/datasource.py" in the Spark repo.</small></div>
</div>
<div data-lang="scala">
<div class="highlight"><pre class="codehilite"><code><span class="nv">usersDF</span><span class="o">.</span><span class="py">write</span><span class="o">.</span><span class="py">format</span><span class="o">(</span><span class="s">"parquet"</span><span class="o">)</span>
<span class="o">.</span><span class="py">option</span><span class="o">(</span><span class="s">"parquet.bloom.filter.enabled#favorite_color"</span><span class="o">,</span> <span class="s">"true"</span><span class="o">)</span>
<span class="o">.</span><span class="py">option</span><span class="o">(</span><span class="s">"parquet.bloom.filter.expected.ndv#favorite_color"</span><span class="o">,</span> <span class="s">"1000000"</span><span class="o">)</span>
<span class="o">.</span><span class="py">option</span><span class="o">(</span><span class="s">"parquet.enable.dictionary"</span><span class="o">,</span> <span class="s">"true"</span><span class="o">)</span>
<span class="o">.</span><span class="py">option</span><span class="o">(</span><span class="s">"parquet.page.write-checksum.enabled"</span><span class="o">,</span> <span class="s">"false"</span><span class="o">)</span>
<span class="o">.</span><span class="py">save</span><span class="o">(</span><span class="s">"users_with_options.parquet"</span><span class="o">)</span></code></pre></div>
<div><small>Find full example code at "examples/src/main/scala/org/apache/spark/examples/sql/SQLDataSourceExample.scala" in the Spark repo.</small></div>
</div>
<div data-lang="java">
<div class="highlight"><pre class="codehilite"><code><span class="n">usersDF</span><span class="o">.</span><span class="na">write</span><span class="o">().</span><span class="na">format</span><span class="o">(</span><span class="s">"parquet"</span><span class="o">)</span>
<span class="o">.</span><span class="na">option</span><span class="o">(</span><span class="s">"parquet.bloom.filter.enabled#favorite_color"</span><span class="o">,</span> <span class="s">"true"</span><span class="o">)</span>
<span class="o">.</span><span class="na">option</span><span class="o">(</span><span class="s">"parquet.bloom.filter.expected.ndv#favorite_color"</span><span class="o">,</span> <span class="s">"1000000"</span><span class="o">)</span>
<span class="o">.</span><span class="na">option</span><span class="o">(</span><span class="s">"parquet.enable.dictionary"</span><span class="o">,</span> <span class="s">"true"</span><span class="o">)</span>
<span class="o">.</span><span class="na">option</span><span class="o">(</span><span class="s">"parquet.page.write-checksum.enabled"</span><span class="o">,</span> <span class="s">"false"</span><span class="o">)</span>
<span class="o">.</span><span class="na">save</span><span class="o">(</span><span class="s">"users_with_options.parquet"</span><span class="o">);</span></code></pre></div>
<div><small>Find full example code at "examples/src/main/java/org/apache/spark/examples/sql/JavaSQLDataSourceExample.java" in the Spark repo.</small></div>
</div>
<div data-lang="r">
<div class="highlight"><pre class="codehilite"><code><span class="n">df</span><span class="w"> </span><span class="o">&lt;-</span><span class="w"> </span><span class="n">read.df</span><span class="p">(</span><span class="s2">"examples/src/main/resources/users.parquet"</span><span class="p">,</span><span class="w"> </span><span class="s2">"parquet"</span><span class="p">)</span><span class="w">
</span><span class="n">write.parquet</span><span class="p">(</span><span class="n">df</span><span class="p">,</span><span class="w"> </span><span class="s2">"users_with_options.parquet"</span><span class="p">,</span><span class="w"> </span><span class="n">parquet.bloom.filter.enabled</span><span class="c1">#favorite_color = true, parquet.bloom.filter.expected.ndv#favorite_color = 1000000, parquet.enable.dictionary = true, parquet.page.write-checksum.enabled = false)</span></code></pre></div>
<div><small>Find full example code at "examples/src/main/r/RSparkSQLExample.R" in the Spark repo.</small></div>
</div>
<div data-lang="SQL">
<figure class="highlight"><pre><code class="language-sql" data-lang="sql"><span class="k">CREATE</span> <span class="k">TABLE</span> <span class="n">users_with_options</span> <span class="p">(</span>
<span class="n">name</span> <span class="n">STRING</span><span class="p">,</span>
<span class="n">favorite_color</span> <span class="n">STRING</span><span class="p">,</span>
<span class="n">favorite_numbers</span> <span class="n">array</span><span class="o">&lt;</span><span class="nb">integer</span><span class="o">&gt;</span>
<span class="p">)</span> <span class="k">USING</span> <span class="n">parquet</span>
<span class="k">OPTIONS</span> <span class="p">(</span>
<span class="nv">`parquet.bloom.filter.enabled#favorite_color`</span> <span class="k">true</span><span class="p">,</span>
<span class="nv">`parquet.bloom.filter.expected.ndv#favorite_color`</span> <span class="mi">1000000</span><span class="p">,</span>
<span class="n">parquet</span><span class="p">.</span><span class="n">enable</span><span class="p">.</span><span class="k">dictionary</span> <span class="k">true</span><span class="p">,</span>
<span class="n">parquet</span><span class="p">.</span><span class="n">page</span><span class="p">.</span><span class="k">write</span><span class="o">-</span><span class="n">checksum</span><span class="p">.</span><span class="n">enabled</span> <span class="k">true</span>
<span class="p">)</span></code></pre></figure>
</div>
</div>
<h3 id="run-sql-on-files-directly">Run SQL on files directly</h3>
<p>Instead of using the read API to load a file into a DataFrame and querying it, you can also query that
file directly with SQL.</p>
<div class="codetabs">
<div data-lang="python">
<div class="highlight"><pre class="codehilite"><code><span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="p">.</span><span class="n">sql</span><span class="p">(</span><span class="s">"SELECT * FROM parquet.`examples/src/main/resources/users.parquet`"</span><span class="p">)</span></code></pre></div>
<div><small>Find full example code at "examples/src/main/python/sql/datasource.py" in the Spark repo.</small></div>
</div>
<div data-lang="scala">
<div class="highlight"><pre class="codehilite"><code><span class="k">val</span> <span class="nv">sqlDF</span> <span class="k">=</span> <span class="nv">spark</span><span class="o">.</span><span class="py">sql</span><span class="o">(</span><span class="s">"SELECT * FROM parquet.`examples/src/main/resources/users.parquet`"</span><span class="o">)</span></code></pre></div>
<div><small>Find full example code at "examples/src/main/scala/org/apache/spark/examples/sql/SQLDataSourceExample.scala" in the Spark repo.</small></div>
</div>
<div data-lang="java">
<div class="highlight"><pre class="codehilite"><code><span class="nc">Dataset</span><span class="o">&lt;</span><span class="nc">Row</span><span class="o">&gt;</span> <span class="n">sqlDF</span> <span class="o">=</span>
<span class="n">spark</span><span class="o">.</span><span class="na">sql</span><span class="o">(</span><span class="s">"SELECT * FROM parquet.`examples/src/main/resources/users.parquet`"</span><span class="o">);</span></code></pre></div>
<div><small>Find full example code at "examples/src/main/java/org/apache/spark/examples/sql/JavaSQLDataSourceExample.java" in the Spark repo.</small></div>
</div>
<div data-lang="r">
<div class="highlight"><pre class="codehilite"><code><span class="n">df</span><span class="w"> </span><span class="o">&lt;-</span><span class="w"> </span><span class="n">sql</span><span class="p">(</span><span class="s2">"SELECT * FROM parquet.`examples/src/main/resources/users.parquet`"</span><span class="p">)</span></code></pre></div>
<div><small>Find full example code at "examples/src/main/r/RSparkSQLExample.R" in the Spark repo.</small></div>
</div>
</div>
<h3 id="save-modes">Save Modes</h3>
<p>Save operations can optionally take a <code class="language-plaintext highlighter-rouge">SaveMode</code>, which specifies how to handle existing data if
present. It is important to realize that these save modes do not utilize any locking and are not
atomic. Additionally, when performing an <code class="language-plaintext highlighter-rouge">Overwrite</code>, the data will be deleted before writing out the
new data.</p>
<table>
<thead><tr><th>Scala/Java</th><th>Any Language</th><th>Meaning</th></tr></thead>
<tr>
<td><code>SaveMode.ErrorIfExists</code> (default)</td>
<td><code>"error" or "errorifexists"</code> (default)</td>
<td>
When saving a DataFrame to a data source, if data already exists,
an exception is expected to be thrown.
</td>
</tr>
<tr>
<td><code>SaveMode.Append</code></td>
<td><code>"append"</code></td>
<td>
When saving a DataFrame to a data source, if data/table already exists,
contents of the DataFrame are expected to be appended to existing data.
</td>
</tr>
<tr>
<td><code>SaveMode.Overwrite</code></td>
<td><code>"overwrite"</code></td>
<td>
Overwrite mode means that when saving a DataFrame to a data source,
if data/table already exists, existing data is expected to be overwritten by the contents of
the DataFrame.
</td>
</tr>
<tr>
<td><code>SaveMode.Ignore</code></td>
<td><code>"ignore"</code></td>
<td>
Ignore mode means that when saving a DataFrame to a data source, if data already exists,
the save operation is expected not to save the contents of the DataFrame and not to
change the existing data. This is similar to a <code>CREATE TABLE IF NOT EXISTS</code> in SQL.
</td>
</tr>
</table>
<h3 id="saving-to-persistent-tables">Saving to Persistent Tables</h3>
<p><code class="language-plaintext highlighter-rouge">DataFrames</code> can also be saved as persistent tables into Hive metastore using the <code class="language-plaintext highlighter-rouge">saveAsTable</code>
command. Notice that an existing Hive deployment is not necessary to use this feature. Spark will create a
default local Hive metastore (using Derby) for you. Unlike the <code class="language-plaintext highlighter-rouge">createOrReplaceTempView</code> command,
<code class="language-plaintext highlighter-rouge">saveAsTable</code> will materialize the contents of the DataFrame and create a pointer to the data in the
Hive metastore. Persistent tables will still exist even after your Spark program has restarted, as
long as you maintain your connection to the same metastore. A DataFrame for a persistent table can
be created by calling the <code class="language-plaintext highlighter-rouge">table</code> method on a <code class="language-plaintext highlighter-rouge">SparkSession</code> with the name of the table.</p>
<p>For file-based data sources, e.g. text, parquet, json, etc., you can specify a custom table path via the
<code class="language-plaintext highlighter-rouge">path</code> option, e.g. <code class="language-plaintext highlighter-rouge">df.write.option("path", "/some/path").saveAsTable("t")</code>. When the table is dropped,
the custom table path will not be removed and the table data is still there. If no custom table path is
specified, Spark will write data to a default table path under the warehouse directory. When the table is
dropped, the default table path will be removed too.</p>
<p>Starting from Spark 2.1, persistent datasource tables have per-partition metadata stored in the Hive metastore. This brings several benefits:</p>
<ul>
<li>Since the metastore can return only necessary partitions for a query, discovering all the partitions on the first query to the table is no longer needed.</li>
<li>Hive DDLs such as <code class="language-plaintext highlighter-rouge">ALTER TABLE PARTITION ... SET LOCATION</code> are now available for tables created with the Datasource API.</li>
</ul>
<p>Note that partition information is not gathered by default when creating external datasource tables (those with a <code class="language-plaintext highlighter-rouge">path</code> option). To sync the partition information in the metastore, you can invoke <code class="language-plaintext highlighter-rouge">MSCK REPAIR TABLE</code>.</p>
<h3 id="bucketing-sorting-and-partitioning">Bucketing, Sorting and Partitioning</h3>
<p>For file-based data sources, it is also possible to bucket and sort or partition the output.
Bucketing and sorting are applicable only to persistent tables:</p>
<div class="codetabs">
<div data-lang="python">
<div class="highlight"><pre class="codehilite"><code><span class="n">df</span><span class="p">.</span><span class="n">write</span><span class="p">.</span><span class="n">bucketBy</span><span class="p">(</span><span class="mi">42</span><span class="p">,</span> <span class="s">"name"</span><span class="p">).</span><span class="n">sortBy</span><span class="p">(</span><span class="s">"age"</span><span class="p">).</span><span class="n">saveAsTable</span><span class="p">(</span><span class="s">"people_bucketed"</span><span class="p">)</span></code></pre></div>
<div><small>Find full example code at "examples/src/main/python/sql/datasource.py" in the Spark repo.</small></div>
</div>
<div data-lang="scala">
<div class="highlight"><pre class="codehilite"><code><span class="nv">peopleDF</span><span class="o">.</span><span class="py">write</span><span class="o">.</span><span class="py">bucketBy</span><span class="o">(</span><span class="mi">42</span><span class="o">,</span> <span class="s">"name"</span><span class="o">).</span><span class="py">sortBy</span><span class="o">(</span><span class="s">"age"</span><span class="o">).</span><span class="py">saveAsTable</span><span class="o">(</span><span class="s">"people_bucketed"</span><span class="o">)</span></code></pre></div>
<div><small>Find full example code at "examples/src/main/scala/org/apache/spark/examples/sql/SQLDataSourceExample.scala" in the Spark repo.</small></div>
</div>
<div data-lang="java">
<div class="highlight"><pre class="codehilite"><code><span class="n">peopleDF</span><span class="o">.</span><span class="na">write</span><span class="o">().</span><span class="na">bucketBy</span><span class="o">(</span><span class="mi">42</span><span class="o">,</span> <span class="s">"name"</span><span class="o">).</span><span class="na">sortBy</span><span class="o">(</span><span class="s">"age"</span><span class="o">).</span><span class="na">saveAsTable</span><span class="o">(</span><span class="s">"people_bucketed"</span><span class="o">);</span></code></pre></div>
<div><small>Find full example code at "examples/src/main/java/org/apache/spark/examples/sql/JavaSQLDataSourceExample.java" in the Spark repo.</small></div>
</div>
<div data-lang="SQL">
<figure class="highlight"><pre><code class="language-sql" data-lang="sql"><span class="k">CREATE</span> <span class="k">TABLE</span> <span class="n">users_bucketed_by_name</span><span class="p">(</span>
<span class="n">name</span> <span class="n">STRING</span><span class="p">,</span>
<span class="n">favorite_color</span> <span class="n">STRING</span><span class="p">,</span>
<span class="n">favorite_numbers</span> <span class="n">array</span><span class="o">&lt;</span><span class="nb">integer</span><span class="o">&gt;</span>
<span class="p">)</span> <span class="k">USING</span> <span class="n">parquet</span>
<span class="n">CLUSTERED</span> <span class="k">BY</span><span class="p">(</span><span class="n">name</span><span class="p">)</span> <span class="k">INTO</span> <span class="mi">42</span> <span class="n">BUCKETS</span><span class="p">;</span></code></pre></figure>
</div>
</div>
<p>while partitioning can be used with both <code class="language-plaintext highlighter-rouge">save</code> and <code class="language-plaintext highlighter-rouge">saveAsTable</code> when using the Dataset APIs.</p>
<div class="codetabs">
<div data-lang="python">
<div class="highlight"><pre class="codehilite"><code><span class="n">df</span><span class="p">.</span><span class="n">write</span><span class="p">.</span><span class="n">partitionBy</span><span class="p">(</span><span class="s">"favorite_color"</span><span class="p">).</span><span class="nb">format</span><span class="p">(</span><span class="s">"parquet"</span><span class="p">).</span><span class="n">save</span><span class="p">(</span><span class="s">"namesPartByColor.parquet"</span><span class="p">)</span></code></pre></div>
<div><small>Find full example code at "examples/src/main/python/sql/datasource.py" in the Spark repo.</small></div>
</div>
<div data-lang="scala">
<div class="highlight"><pre class="codehilite"><code><span class="nv">usersDF</span><span class="o">.</span><span class="py">write</span><span class="o">.</span><span class="py">partitionBy</span><span class="o">(</span><span class="s">"favorite_color"</span><span class="o">).</span><span class="py">format</span><span class="o">(</span><span class="s">"parquet"</span><span class="o">).</span><span class="py">save</span><span class="o">(</span><span class="s">"namesPartByColor.parquet"</span><span class="o">)</span></code></pre></div>
<div><small>Find full example code at "examples/src/main/scala/org/apache/spark/examples/sql/SQLDataSourceExample.scala" in the Spark repo.</small></div>
</div>
<div data-lang="java">
<div class="highlight"><pre class="codehilite"><code><span class="n">usersDF</span>
<span class="o">.</span><span class="na">write</span><span class="o">()</span>
<span class="o">.</span><span class="na">partitionBy</span><span class="o">(</span><span class="s">"favorite_color"</span><span class="o">)</span>
<span class="o">.</span><span class="na">format</span><span class="o">(</span><span class="s">"parquet"</span><span class="o">)</span>
<span class="o">.</span><span class="na">save</span><span class="o">(</span><span class="s">"namesPartByColor.parquet"</span><span class="o">);</span></code></pre></div>
<div><small>Find full example code at "examples/src/main/java/org/apache/spark/examples/sql/JavaSQLDataSourceExample.java" in the Spark repo.</small></div>
</div>
<div data-lang="SQL">
<figure class="highlight"><pre><code class="language-sql" data-lang="sql"><span class="k">CREATE</span> <span class="k">TABLE</span> <span class="n">users_by_favorite_color</span><span class="p">(</span>
<span class="n">name</span> <span class="n">STRING</span><span class="p">,</span>
<span class="n">favorite_color</span> <span class="n">STRING</span><span class="p">,</span>
<span class="n">favorite_numbers</span> <span class="n">array</span><span class="o">&lt;</span><span class="nb">integer</span><span class="o">&gt;</span>
<span class="p">)</span> <span class="k">USING</span> <span class="n">csv</span> <span class="n">PARTITIONED</span> <span class="k">BY</span><span class="p">(</span><span class="n">favorite_color</span><span class="p">);</span></code></pre></figure>
</div>
</div>
<p>It is possible to use both partitioning and bucketing for a single table:</p>
<div class="codetabs">
<div data-lang="python">
<div class="highlight"><pre class="codehilite"><code><span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="p">.</span><span class="n">read</span><span class="p">.</span><span class="n">parquet</span><span class="p">(</span><span class="s">"examples/src/main/resources/users.parquet"</span><span class="p">)</span>
<span class="p">(</span><span class="n">df</span>
<span class="p">.</span><span class="n">write</span>
<span class="p">.</span><span class="n">partitionBy</span><span class="p">(</span><span class="s">"favorite_color"</span><span class="p">)</span>
<span class="p">.</span><span class="n">bucketBy</span><span class="p">(</span><span class="mi">42</span><span class="p">,</span> <span class="s">"name"</span><span class="p">)</span>
<span class="p">.</span><span class="n">saveAsTable</span><span class="p">(</span><span class="s">"users_partitioned_bucketed"</span><span class="p">))</span></code></pre></div>
<div><small>Find full example code at "examples/src/main/python/sql/datasource.py" in the Spark repo.</small></div>
</div>
<div data-lang="scala">
<div class="highlight"><pre class="codehilite"><code><span class="n">usersDF</span>
<span class="o">.</span><span class="py">write</span>
<span class="o">.</span><span class="py">partitionBy</span><span class="o">(</span><span class="s">"favorite_color"</span><span class="o">)</span>
<span class="o">.</span><span class="py">bucketBy</span><span class="o">(</span><span class="mi">42</span><span class="o">,</span> <span class="s">"name"</span><span class="o">)</span>
<span class="o">.</span><span class="py">saveAsTable</span><span class="o">(</span><span class="s">"users_partitioned_bucketed"</span><span class="o">)</span></code></pre></div>
<div><small>Find full example code at "examples/src/main/scala/org/apache/spark/examples/sql/SQLDataSourceExample.scala" in the Spark repo.</small></div>
</div>
<div data-lang="java">
<div class="highlight"><pre class="codehilite"><code><span class="n">usersDF</span>
<span class="o">.</span><span class="na">write</span><span class="o">()</span>
<span class="o">.</span><span class="na">partitionBy</span><span class="o">(</span><span class="s">"favorite_color"</span><span class="o">)</span>
<span class="o">.</span><span class="na">bucketBy</span><span class="o">(</span><span class="mi">42</span><span class="o">,</span> <span class="s">"name"</span><span class="o">)</span>
<span class="o">.</span><span class="na">saveAsTable</span><span class="o">(</span><span class="s">"users_partitioned_bucketed"</span><span class="o">);</span></code></pre></div>
<div><small>Find full example code at "examples/src/main/java/org/apache/spark/examples/sql/JavaSQLDataSourceExample.java" in the Spark repo.</small></div>
</div>
<div data-lang="SQL">
<figure class="highlight"><pre><code class="language-sql" data-lang="sql"><span class="k">CREATE</span> <span class="k">TABLE</span> <span class="n">users_bucketed_and_partitioned</span><span class="p">(</span>
<span class="n">name</span> <span class="n">STRING</span><span class="p">,</span>
<span class="n">favorite_color</span> <span class="n">STRING</span><span class="p">,</span>
<span class="n">favorite_numbers</span> <span class="n">array</span><span class="o">&lt;</span><span class="nb">integer</span><span class="o">&gt;</span>
<span class="p">)</span> <span class="k">USING</span> <span class="n">parquet</span>
<span class="n">PARTITIONED</span> <span class="k">BY</span> <span class="p">(</span><span class="n">favorite_color</span><span class="p">)</span>
<span class="n">CLUSTERED</span> <span class="k">BY</span><span class="p">(</span><span class="n">name</span><span class="p">)</span> <span class="n">SORTED</span> <span class="k">BY</span> <span class="p">(</span><span class="n">favorite_numbers</span><span class="p">)</span> <span class="k">INTO</span> <span class="mi">42</span> <span class="n">BUCKETS</span><span class="p">;</span></code></pre></figure>
</div>
</div>
<p><code class="language-plaintext highlighter-rouge">partitionBy</code> creates a directory structure as described in the <a href="sql-data-sources-parquet.html#partition-discovery">Partition Discovery</a> section.
Thus, it has limited applicability to columns with high cardinality. In contrast,
<code class="language-plaintext highlighter-rouge">bucketBy</code> distributes
data across a fixed number of buckets and can be used when the number of unique values is unbounded.</p>
</div>
<!-- /container -->
</div>
<script src="js/vendor/jquery-3.5.1.min.js"></script>
<script src="js/vendor/bootstrap.bundle.min.js"></script>
<script src="js/vendor/anchor.min.js"></script>
<script src="js/main.js"></script>
<script type="text/javascript" src="js/vendor/docsearch.min.js"></script>
<script type="text/javascript">
// DocSearch is entirely free and automated. DocSearch is built in two parts:
// 1. a crawler which we run on our own infrastructure every 24 hours. It follows every link
// in your website and extracts content from every page it traverses. It then pushes this
// content to an Algolia index.
// 2. a JavaScript snippet to be inserted in your website that will bind this Algolia index
// to your search input and display its results in a dropdown UI. If you want to find more
// details on how DocSearch works, check the DocSearch documentation.
// NOTE(review): apiKey here is presumably Algolia's public search-only key (safe to ship
// in client-side code) — confirm it is not an admin key.
docsearch({
apiKey: 'd62f962a82bc9abb53471cb7b89da35e',
appId: 'RAI69RXRSK',
indexName: 'apache_spark',
inputSelector: '#docsearch-input',
enhancedSearchInput: true,
algoliaOptions: {
// Restrict search hits to this documentation version only.
'facetFilters': ["version:3.5.5"]
},
debug: false // Set debug to true if you want to inspect the dropdown
});
</script>
<!-- MathJax Section -->
<script type="text/x-mathjax-config">
// Not executed by the browser: MathJax reads and evaluates this block itself
// once it loads. Enables automatic AMS-style numbering of displayed equations.
MathJax.Hub.Config({
TeX: { equationNumbers: { autoNumber: "AMS" } }
});
</script>
<script>
// Dynamically injects the MathJax 2.7.1 loader from cdnjs. The URL scheme is
// derived from document.location.protocol so the docs also render when opened
// from a local file:// URL; a protocol-relative "//cdnjs..." URL would not
// work in that case.
(function(doc) {
  var loader = doc.createElement('script');
  loader.type = 'text/javascript';
  loader.async = true;
  // Configure the tex2jax preprocessor only once MathJax itself is available.
  loader.onload = function() {
    MathJax.Hub.Config({
      tex2jax: {
        inlineMath: [ ["$", "$"], ["\\\\(","\\\\)"] ],
        displayMath: [ ["$$","$$"], ["\\[", "\\]"] ],
        processEscapes: true,
        skipTags: ['script', 'noscript', 'style', 'textarea', 'pre']
      }
    });
  };
  var scheme = ('https:' == document.location.protocol) ? 'https://' : 'http://';
  loader.src = scheme +
    'cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.1/MathJax.js' +
    '?config=TeX-AMS-MML_HTMLorMML';
  doc.getElementsByTagName('head')[0].appendChild(loader);
})(document);
</script>
</body>
</html>