<!DOCTYPE html>
<!--[if lt IE 7]> <html class="no-js lt-ie9 lt-ie8 lt-ie7"> <![endif]-->
<!--[if IE 7]> <html class="no-js lt-ie9 lt-ie8"> <![endif]-->
<!--[if IE 8]> <html class="no-js lt-ie9"> <![endif]-->
<!--[if gt IE 8]><!--> <html class="no-js"> <!--<![endif]-->
<head>
<meta charset="utf-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Parquet Files - Spark 3.5.0 Documentation</title>
<link rel="stylesheet" href="css/bootstrap.min.css">
<link rel="preconnect" href="https://fonts.googleapis.com">
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
<link href="https://fonts.googleapis.com/css2?family=DM+Sans:ital,wght@0,400;0,500;0,700;1,400;1,500;1,700&Courier+Prime:wght@400;700&display=swap" rel="stylesheet">
<link href="css/custom.css" rel="stylesheet">
<script src="js/vendor/modernizr-2.6.1-respond-1.1.0.min.js"></script>
<link rel="stylesheet" href="css/pygments-default.css">
<link rel="stylesheet" href="css/docsearch.min.css" />
<link rel="stylesheet" href="css/docsearch.css">
<!-- Matomo -->
<script type="text/javascript">
var _paq = window._paq = window._paq || [];
/* tracker methods like "setCustomDimension" should be called before "trackPageView" */
_paq.push(["disableCookies"]);
_paq.push(['trackPageView']);
_paq.push(['enableLinkTracking']);
(function() {
var u="https://analytics.apache.org/";
_paq.push(['setTrackerUrl', u+'matomo.php']);
_paq.push(['setSiteId', '40']);
var d=document, g=d.createElement('script'), s=d.getElementsByTagName('script')[0];
g.async=true; g.src=u+'matomo.js'; s.parentNode.insertBefore(g,s);
})();
</script>
<!-- End Matomo Code -->
</head>
<body class="global">
<!--[if lt IE 7]>
<p class="chromeframe">You are using an outdated browser. <a href="https://browsehappy.com/">Upgrade your browser today</a> or <a href="http://www.google.com/chromeframe/?redirect=true">install Google Chrome Frame</a> to better experience this site.</p>
<![endif]-->
<!-- This code is taken from http://twitter.github.com/bootstrap/examples/hero.html -->
<nav class="navbar navbar-expand-lg navbar-dark p-0 px-4 fixed-top" style="background: #1d6890;" id="topbar">
<div class="navbar-brand"><a href="index.html">
<img src="img/spark-logo-rev.svg" width="141" height="72"/></a><span class="version">3.5.0</span>
</div>
<button class="navbar-toggler" type="button" data-toggle="collapse"
data-target="#navbarCollapse" aria-controls="navbarCollapse"
aria-expanded="false" aria-label="Toggle navigation">
<span class="navbar-toggler-icon"></span>
</button>
<div class="collapse navbar-collapse" id="navbarCollapse">
<ul class="navbar-nav me-auto">
<li class="nav-item"><a href="index.html" class="nav-link">Overview</a></li>
<li class="nav-item dropdown">
<a href="#" class="nav-link dropdown-toggle" id="navbarQuickStart" role="button" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false">Programming Guides</a>
<div class="dropdown-menu" aria-labelledby="navbarQuickStart">
<a class="dropdown-item" href="quick-start.html">Quick Start</a>
<a class="dropdown-item" href="rdd-programming-guide.html">RDDs, Accumulators, Broadcasts Vars</a>
<a class="dropdown-item" href="sql-programming-guide.html">SQL, DataFrames, and Datasets</a>
<a class="dropdown-item" href="structured-streaming-programming-guide.html">Structured Streaming</a>
<a class="dropdown-item" href="streaming-programming-guide.html">Spark Streaming (DStreams)</a>
<a class="dropdown-item" href="ml-guide.html">MLlib (Machine Learning)</a>
<a class="dropdown-item" href="graphx-programming-guide.html">GraphX (Graph Processing)</a>
<a class="dropdown-item" href="sparkr.html">SparkR (R on Spark)</a>
<a class="dropdown-item" href="api/python/getting_started/index.html">PySpark (Python on Spark)</a>
</div>
</li>
<li class="nav-item dropdown">
<a href="#" class="nav-link dropdown-toggle" id="navbarAPIDocs" role="button" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false">API Docs</a>
<div class="dropdown-menu" aria-labelledby="navbarAPIDocs">
<a class="dropdown-item" href="api/scala/org/apache/spark/index.html">Scala</a>
<a class="dropdown-item" href="api/java/index.html">Java</a>
<a class="dropdown-item" href="api/python/index.html">Python</a>
<a class="dropdown-item" href="api/R/index.html">R</a>
<a class="dropdown-item" href="api/sql/index.html">SQL, Built-in Functions</a>
</div>
</li>
<li class="nav-item dropdown">
<a href="#" class="nav-link dropdown-toggle" id="navbarDeploying" role="button" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false">Deploying</a>
<div class="dropdown-menu" aria-labelledby="navbarDeploying">
<a class="dropdown-item" href="cluster-overview.html">Overview</a>
<a class="dropdown-item" href="submitting-applications.html">Submitting Applications</a>
<div class="dropdown-divider"></div>
<a class="dropdown-item" href="spark-standalone.html">Spark Standalone</a>
<a class="dropdown-item" href="running-on-mesos.html">Mesos</a>
<a class="dropdown-item" href="running-on-yarn.html">YARN</a>
<a class="dropdown-item" href="running-on-kubernetes.html">Kubernetes</a>
</div>
</li>
<li class="nav-item dropdown">
<a href="#" class="nav-link dropdown-toggle" id="navbarMore" role="button" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false">More</a>
<div class="dropdown-menu" aria-labelledby="navbarMore">
<a class="dropdown-item" href="configuration.html">Configuration</a>
<a class="dropdown-item" href="monitoring.html">Monitoring</a>
<a class="dropdown-item" href="tuning.html">Tuning Guide</a>
<a class="dropdown-item" href="job-scheduling.html">Job Scheduling</a>
<a class="dropdown-item" href="security.html">Security</a>
<a class="dropdown-item" href="hardware-provisioning.html">Hardware Provisioning</a>
<a class="dropdown-item" href="migration-guide.html">Migration Guide</a>
<div class="dropdown-divider"></div>
<a class="dropdown-item" href="building-spark.html">Building Spark</a>
<a class="dropdown-item" href="https://spark.apache.org/contributing.html">Contributing to Spark</a>
<a class="dropdown-item" href="https://spark.apache.org/third-party-projects.html">Third Party Projects</a>
</div>
</li>
<li class="nav-item">
<input type="text" id="docsearch-input" placeholder="Search the docs…">
</li>
</ul>
<!--<span class="navbar-text navbar-right"><span class="version-text">v3.5.0</span></span>-->
</div>
</nav>
<div class="container">
<div class="left-menu-wrapper">
<div class="left-menu">
<h3><a href="sql-programming-guide.html">Spark SQL Guide</a></h3>
<ul>
<li>
<a href="sql-getting-started.html">
Getting Started
</a>
</li>
<li>
<a href="sql-data-sources.html">
Data Sources
</a>
</li>
<ul>
<li>
<a href="sql-data-sources-load-save-functions.html">
Generic Load/Save Functions
</a>
</li>
<li>
<a href="sql-data-sources-generic-options.html">
Generic File Source Options
</a>
</li>
<li>
<a href="sql-data-sources-parquet.html">
Parquet Files
</a>
</li>
<li>
<a href="sql-data-sources-orc.html">
ORC Files
</a>
</li>
<li>
<a href="sql-data-sources-json.html">
JSON Files
</a>
</li>
<li>
<a href="sql-data-sources-csv.html">
CSV Files
</a>
</li>
<li>
<a href="sql-data-sources-text.html">
Text Files
</a>
</li>
<li>
<a href="sql-data-sources-hive-tables.html">
Hive Tables
</a>
</li>
<li>
<a href="sql-data-sources-jdbc.html">
JDBC To Other Databases
</a>
</li>
<li>
<a href="sql-data-sources-avro.html">
Avro Files
</a>
</li>
<li>
<a href="sql-data-sources-protobuf.html">
Protobuf data
</a>
</li>
<li>
<a href="sql-data-sources-binaryFile.html">
Whole Binary Files
</a>
</li>
<li>
<a href="sql-data-sources-troubleshooting.html">
Troubleshooting
</a>
</li>
</ul>
<li>
<a href="sql-performance-tuning.html">
Performance Tuning
</a>
</li>
<li>
<a href="sql-distributed-sql-engine.html">
Distributed SQL Engine
</a>
</li>
<li>
<a href="sql-pyspark-pandas-with-arrow.html">
PySpark Usage Guide for Pandas with Apache Arrow
</a>
</li>
<li>
<a href="sql-migration-guide.html">
Migration Guide
</a>
</li>
<li>
<a href="sql-ref.html">
SQL Reference
</a>
</li>
<li>
<a href="sql-error-conditions.html">
Error Conditions
</a>
</li>
</ul>
</div>
</div>
<input id="nav-trigger" class="nav-trigger" checked type="checkbox">
<label for="nav-trigger"></label>
<div class="content-with-sidebar mr-3" id="content">
<h1 class="title">Parquet Files</h1>
<ul id="markdown-toc">
<li><a href="#loading-data-programmatically" id="markdown-toc-loading-data-programmatically">Loading Data Programmatically</a></li>
<li><a href="#partition-discovery" id="markdown-toc-partition-discovery">Partition Discovery</a></li>
<li><a href="#schema-merging" id="markdown-toc-schema-merging">Schema Merging</a></li>
<li><a href="#hive-metastore-parquet-table-conversion" id="markdown-toc-hive-metastore-parquet-table-conversion">Hive metastore Parquet table conversion</a> <ul>
<li><a href="#hiveparquet-schema-reconciliation" id="markdown-toc-hiveparquet-schema-reconciliation">Hive/Parquet Schema Reconciliation</a></li>
<li><a href="#metadata-refreshing" id="markdown-toc-metadata-refreshing">Metadata Refreshing</a></li>
</ul>
</li>
<li><a href="#columnar-encryption" id="markdown-toc-columnar-encryption">Columnar Encryption</a> <ul>
<li><a href="#kms-client" id="markdown-toc-kms-client">KMS Client</a></li>
</ul>
</li>
<li><a href="#data-source-option" id="markdown-toc-data-source-option">Data Source Option</a> <ul>
<li><a href="#configuration" id="markdown-toc-configuration">Configuration</a></li>
</ul>
</li>
</ul>
<p><a href="https://parquet.apache.org">Parquet</a> is a columnar format that is supported by many other data processing systems.
Spark SQL provides support for both reading and writing Parquet files that automatically preserves the schema
of the original data. When reading Parquet files, all columns are automatically converted to be nullable for
compatibility reasons.</p>
<h3 id="loading-data-programmatically">Loading Data Programmatically</h3>
<p>Using the data from the above example:</p>
<div class="codetabs">
<div data-lang="python">
<div class="highlight"><pre class="codehilite"><code><span class="n">peopleDF</span> <span class="o">=</span> <span class="n">spark</span><span class="p">.</span><span class="n">read</span><span class="p">.</span><span class="n">json</span><span class="p">(</span><span class="s">"examples/src/main/resources/people.json"</span><span class="p">)</span>
<span class="c1"># DataFrames can be saved as Parquet files, maintaining the schema information.
</span><span class="n">peopleDF</span><span class="p">.</span><span class="n">write</span><span class="p">.</span><span class="n">parquet</span><span class="p">(</span><span class="s">"people.parquet"</span><span class="p">)</span>
<span class="c1"># Read in the Parquet file created above.
# Parquet files are self-describing so the schema is preserved.
# The result of loading a parquet file is also a DataFrame.
</span><span class="n">parquetFile</span> <span class="o">=</span> <span class="n">spark</span><span class="p">.</span><span class="n">read</span><span class="p">.</span><span class="n">parquet</span><span class="p">(</span><span class="s">"people.parquet"</span><span class="p">)</span>
<span class="c1"># Parquet files can also be used to create a temporary view and then used in SQL statements.
</span><span class="n">parquetFile</span><span class="p">.</span><span class="n">createOrReplaceTempView</span><span class="p">(</span><span class="s">"parquetFile"</span><span class="p">)</span>
<span class="n">teenagers</span> <span class="o">=</span> <span class="n">spark</span><span class="p">.</span><span class="n">sql</span><span class="p">(</span><span class="s">"SELECT name FROM parquetFile WHERE age &gt;= 13 AND age &lt;= 19"</span><span class="p">)</span>
<span class="n">teenagers</span><span class="p">.</span><span class="n">show</span><span class="p">()</span>
<span class="c1"># +------+
# | name|
# +------+
# |Justin|
# +------+</span></code></pre></div>
<div><small>Find full example code at "examples/src/main/python/sql/datasource.py" in the Spark repo.</small></div>
</div>
<div data-lang="scala">
<div class="highlight"><pre class="codehilite"><code><span class="c1">// Encoders for most common types are automatically provided by importing spark.implicits._</span>
<span class="k">import</span> <span class="nn">spark.implicits._</span>
<span class="k">val</span> <span class="nv">peopleDF</span> <span class="k">=</span> <span class="nv">spark</span><span class="o">.</span><span class="py">read</span><span class="o">.</span><span class="py">json</span><span class="o">(</span><span class="s">"examples/src/main/resources/people.json"</span><span class="o">)</span>
<span class="c1">// DataFrames can be saved as Parquet files, maintaining the schema information</span>
<span class="nv">peopleDF</span><span class="o">.</span><span class="py">write</span><span class="o">.</span><span class="py">parquet</span><span class="o">(</span><span class="s">"people.parquet"</span><span class="o">)</span>
<span class="c1">// Read in the parquet file created above</span>
<span class="c1">// Parquet files are self-describing so the schema is preserved</span>
<span class="c1">// The result of loading a Parquet file is also a DataFrame</span>
<span class="k">val</span> <span class="nv">parquetFileDF</span> <span class="k">=</span> <span class="nv">spark</span><span class="o">.</span><span class="py">read</span><span class="o">.</span><span class="py">parquet</span><span class="o">(</span><span class="s">"people.parquet"</span><span class="o">)</span>
<span class="c1">// Parquet files can also be used to create a temporary view and then used in SQL statements</span>
<span class="nv">parquetFileDF</span><span class="o">.</span><span class="py">createOrReplaceTempView</span><span class="o">(</span><span class="s">"parquetFile"</span><span class="o">)</span>
<span class="k">val</span> <span class="nv">namesDF</span> <span class="k">=</span> <span class="nv">spark</span><span class="o">.</span><span class="py">sql</span><span class="o">(</span><span class="s">"SELECT name FROM parquetFile WHERE age BETWEEN 13 AND 19"</span><span class="o">)</span>
<span class="nv">namesDF</span><span class="o">.</span><span class="py">map</span><span class="o">(</span><span class="n">attributes</span> <span class="k">=&gt;</span> <span class="s">"Name: "</span> <span class="o">+</span> <span class="nf">attributes</span><span class="o">(</span><span class="mi">0</span><span class="o">)).</span><span class="py">show</span><span class="o">()</span>
<span class="c1">// +------------+</span>
<span class="c1">// | value|</span>
<span class="c1">// +------------+</span>
<span class="c1">// |Name: Justin|</span>
<span class="c1">// +------------+</span></code></pre></div>
<div><small>Find full example code at "examples/src/main/scala/org/apache/spark/examples/sql/SQLDataSourceExample.scala" in the Spark repo.</small></div>
</div>
<div data-lang="java">
<div class="highlight"><pre class="codehilite"><code><span class="kn">import</span> <span class="nn">org.apache.spark.api.java.function.MapFunction</span><span class="o">;</span>
<span class="kn">import</span> <span class="nn">org.apache.spark.sql.Encoders</span><span class="o">;</span>
<span class="kn">import</span> <span class="nn">org.apache.spark.sql.Dataset</span><span class="o">;</span>
<span class="kn">import</span> <span class="nn">org.apache.spark.sql.Row</span><span class="o">;</span>
<span class="nc">Dataset</span><span class="o">&lt;</span><span class="nc">Row</span><span class="o">&gt;</span> <span class="n">peopleDF</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="na">read</span><span class="o">().</span><span class="na">json</span><span class="o">(</span><span class="s">"examples/src/main/resources/people.json"</span><span class="o">);</span>
<span class="c1">// DataFrames can be saved as Parquet files, maintaining the schema information</span>
<span class="n">peopleDF</span><span class="o">.</span><span class="na">write</span><span class="o">().</span><span class="na">parquet</span><span class="o">(</span><span class="s">"people.parquet"</span><span class="o">);</span>
<span class="c1">// Read in the Parquet file created above.</span>
<span class="c1">// Parquet files are self-describing so the schema is preserved</span>
<span class="c1">// The result of loading a parquet file is also a DataFrame</span>
<span class="nc">Dataset</span><span class="o">&lt;</span><span class="nc">Row</span><span class="o">&gt;</span> <span class="n">parquetFileDF</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="na">read</span><span class="o">().</span><span class="na">parquet</span><span class="o">(</span><span class="s">"people.parquet"</span><span class="o">);</span>
<span class="c1">// Parquet files can also be used to create a temporary view and then used in SQL statements</span>
<span class="n">parquetFileDF</span><span class="o">.</span><span class="na">createOrReplaceTempView</span><span class="o">(</span><span class="s">"parquetFile"</span><span class="o">);</span>
<span class="nc">Dataset</span><span class="o">&lt;</span><span class="nc">Row</span><span class="o">&gt;</span> <span class="n">namesDF</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="na">sql</span><span class="o">(</span><span class="s">"SELECT name FROM parquetFile WHERE age BETWEEN 13 AND 19"</span><span class="o">);</span>
<span class="nc">Dataset</span><span class="o">&lt;</span><span class="nc">String</span><span class="o">&gt;</span> <span class="n">namesDS</span> <span class="o">=</span> <span class="n">namesDF</span><span class="o">.</span><span class="na">map</span><span class="o">(</span>
<span class="o">(</span><span class="nc">MapFunction</span><span class="o">&lt;</span><span class="nc">Row</span><span class="o">,</span> <span class="nc">String</span><span class="o">&gt;)</span> <span class="n">row</span> <span class="o">-&gt;</span> <span class="s">"Name: "</span> <span class="o">+</span> <span class="n">row</span><span class="o">.</span><span class="na">getString</span><span class="o">(</span><span class="mi">0</span><span class="o">),</span>
<span class="nc">Encoders</span><span class="o">.</span><span class="na">STRING</span><span class="o">());</span>
<span class="n">namesDS</span><span class="o">.</span><span class="na">show</span><span class="o">();</span>
<span class="c1">// +------------+</span>
<span class="c1">// | value|</span>
<span class="c1">// +------------+</span>
<span class="c1">// |Name: Justin|</span>
<span class="c1">// +------------+</span></code></pre></div>
<div><small>Find full example code at "examples/src/main/java/org/apache/spark/examples/sql/JavaSQLDataSourceExample.java" in the Spark repo.</small></div>
</div>
<div data-lang="r">
<div class="highlight"><pre class="codehilite"><code><span class="n">df</span><span class="w"> </span><span class="o">&lt;-</span><span class="w"> </span><span class="n">read.df</span><span class="p">(</span><span class="s2">"examples/src/main/resources/people.json"</span><span class="p">,</span><span class="w"> </span><span class="s2">"json"</span><span class="p">)</span><span class="w">
</span><span class="c1"># SparkDataFrame can be saved as Parquet files, maintaining the schema information.</span><span class="w">
</span><span class="n">write.parquet</span><span class="p">(</span><span class="n">df</span><span class="p">,</span><span class="w"> </span><span class="s2">"people.parquet"</span><span class="p">)</span><span class="w">
</span><span class="c1"># Read in the Parquet file created above. Parquet files are self-describing so the schema is preserved.</span><span class="w">
</span><span class="c1"># The result of loading a parquet file is also a DataFrame.</span><span class="w">
</span><span class="n">parquetFile</span><span class="w"> </span><span class="o">&lt;-</span><span class="w"> </span><span class="n">read.parquet</span><span class="p">(</span><span class="s2">"people.parquet"</span><span class="p">)</span><span class="w">
</span><span class="c1"># Parquet files can also be used to create a temporary view and then used in SQL statements.</span><span class="w">
</span><span class="n">createOrReplaceTempView</span><span class="p">(</span><span class="n">parquetFile</span><span class="p">,</span><span class="w"> </span><span class="s2">"parquetFile"</span><span class="p">)</span><span class="w">
</span><span class="n">teenagers</span><span class="w"> </span><span class="o">&lt;-</span><span class="w"> </span><span class="n">sql</span><span class="p">(</span><span class="s2">"SELECT name FROM parquetFile WHERE age &gt;= 13 AND age &lt;= 19"</span><span class="p">)</span><span class="w">
</span><span class="n">head</span><span class="p">(</span><span class="n">teenagers</span><span class="p">)</span><span class="w">
</span><span class="c1">## name</span><span class="w">
</span><span class="c1">## 1 Justin</span><span class="w">
</span><span class="c1"># We can also run custom R-UDFs on Spark DataFrames. Here we prefix all the names with "Name:"</span><span class="w">
</span><span class="n">schema</span><span class="w"> </span><span class="o">&lt;-</span><span class="w"> </span><span class="n">structType</span><span class="p">(</span><span class="n">structField</span><span class="p">(</span><span class="s2">"name"</span><span class="p">,</span><span class="w"> </span><span class="s2">"string"</span><span class="p">))</span><span class="w">
</span><span class="n">teenNames</span><span class="w"> </span><span class="o">&lt;-</span><span class="w"> </span><span class="n">dapply</span><span class="p">(</span><span class="n">df</span><span class="p">,</span><span class="w"> </span><span class="k">function</span><span class="p">(</span><span class="n">p</span><span class="p">)</span><span class="w"> </span><span class="p">{</span><span class="w"> </span><span class="n">cbind</span><span class="p">(</span><span class="n">paste</span><span class="p">(</span><span class="s2">"Name:"</span><span class="p">,</span><span class="w"> </span><span class="n">p</span><span class="o">$</span><span class="n">name</span><span class="p">))</span><span class="w"> </span><span class="p">},</span><span class="w"> </span><span class="n">schema</span><span class="p">)</span><span class="w">
</span><span class="k">for</span><span class="w"> </span><span class="p">(</span><span class="n">teenName</span><span class="w"> </span><span class="k">in</span><span class="w"> </span><span class="n">collect</span><span class="p">(</span><span class="n">teenNames</span><span class="p">)</span><span class="o">$</span><span class="n">name</span><span class="p">)</span><span class="w"> </span><span class="p">{</span><span class="w">
</span><span class="n">cat</span><span class="p">(</span><span class="n">teenName</span><span class="p">,</span><span class="w"> </span><span class="s2">"\n"</span><span class="p">)</span><span class="w">
</span><span class="p">}</span><span class="w">
</span><span class="c1">## Name: Michael</span><span class="w">
</span><span class="c1">## Name: Andy</span><span class="w">
</span><span class="c1">## Name: Justin</span></code></pre></div>
<div><small>Find full example code at "examples/src/main/r/RSparkSQLExample.R" in the Spark repo.</small></div>
</div>
<div data-lang="SQL">
<figure class="highlight"><pre><code class="language-sql" data-lang="sql"><span class="k">CREATE</span> <span class="k">TEMPORARY</span> <span class="k">VIEW</span> <span class="n">parquetTable</span>
<span class="k">USING</span> <span class="n">org</span><span class="p">.</span><span class="n">apache</span><span class="p">.</span><span class="n">spark</span><span class="p">.</span><span class="k">sql</span><span class="p">.</span><span class="n">parquet</span>
<span class="k">OPTIONS</span> <span class="p">(</span>
<span class="n">path</span> <span class="nv">"examples/src/main/resources/people.parquet"</span>
<span class="p">)</span>
<span class="k">SELECT</span> <span class="o">*</span> <span class="k">FROM</span> <span class="n">parquetTable</span></code></pre></figure>
</div>
</div>
<h3 id="partition-discovery">Partition Discovery</h3>
<p>Table partitioning is a common optimization approach used in systems like Hive. In a partitioned
table, data are usually stored in different directories, with partitioning column values encoded in
the path of each partition directory. All built-in file sources (including Text/CSV/JSON/ORC/Parquet)
are able to discover and infer partitioning information automatically.
For example, we can store all our previously used
population data into a partitioned table using the following directory structure, with two extra
columns, <code class="language-plaintext highlighter-rouge">gender</code> and <code class="language-plaintext highlighter-rouge">country</code>, as partitioning columns:</p>
<figure class="highlight"><pre><code class="language-text" data-lang="text">path
└── to
└── table
├── gender=male
│   ├── ...
│   │
│   ├── country=US
│   │   └── data.parquet
│   ├── country=CN
│   │   └── data.parquet
│   └── ...
└── gender=female
   ├── ...
   │
   ├── country=US
   │   └── data.parquet
   ├── country=CN
   │   └── data.parquet
   └── ...</code></pre></figure>
<p>By passing <code class="language-plaintext highlighter-rouge">path/to/table</code> to either <code class="language-plaintext highlighter-rouge">SparkSession.read.parquet</code> or <code class="language-plaintext highlighter-rouge">SparkSession.read.load</code>, Spark SQL
will automatically extract the partitioning information from the paths.
Now the schema of the returned DataFrame becomes:</p>
<figure class="highlight"><pre><code class="language-text" data-lang="text">root
|-- name: string (nullable = true)
|-- age: long (nullable = true)
|-- gender: string (nullable = true)
|-- country: string (nullable = true)</code></pre></figure>
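<p>For instance, a minimal PySpark sketch of the read that produces this schema, assuming an existing <code class="language-plaintext highlighter-rouge">spark</code> session and the hypothetical paths above:</p>
<figure class="highlight"><pre><code class="language-python" data-lang="python"># spark is an existing SparkSession; the path follows the hypothetical layout above
df = spark.read.parquet("path/to/table")
df.printSchema()  # gender and country are discovered as partition columns</code></pre></figure>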
<p>Notice that the data types of the partitioning columns are automatically inferred. Currently,
numeric data types, date, timestamp, and string types are supported. Sometimes users may not want
to infer the data types of the partitioning columns automatically. For these use cases, the
automatic type inference can be configured by
<code class="language-plaintext highlighter-rouge">spark.sql.sources.partitionColumnTypeInference.enabled</code>, which defaults to <code class="language-plaintext highlighter-rouge">true</code>. When type
inference is disabled, the string type is used for the partitioning columns.</p>
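<p>A minimal sketch that disables the automatic type inference before reading the same hypothetical table:</p>
<figure class="highlight"><pre><code class="language-python" data-lang="python"># With inference disabled, partition column values (even numeric or date-like ones)
# are read back as strings
spark.conf.set("spark.sql.sources.partitionColumnTypeInference.enabled", "false")
df = spark.read.parquet("path/to/table")</code></pre></figure>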
<p>Starting from Spark 1.6.0, partition discovery only finds partitions under the given paths
by default. For the above example, if users pass <code class="language-plaintext highlighter-rouge">path/to/table/gender=male</code> to either
<code class="language-plaintext highlighter-rouge">SparkSession.read.parquet</code> or <code class="language-plaintext highlighter-rouge">SparkSession.read.load</code>, <code class="language-plaintext highlighter-rouge">gender</code> will not be considered as a
partitioning column. If users need to specify the base path that partition discovery
should start with, they can set <code class="language-plaintext highlighter-rouge">basePath</code> in the data source options. For example,
when <code class="language-plaintext highlighter-rouge">path/to/table/gender=male</code> is the path of the data and
users set <code class="language-plaintext highlighter-rouge">basePath</code> to <code class="language-plaintext highlighter-rouge">path/to/table/</code>, <code class="language-plaintext highlighter-rouge">gender</code> will be a partitioning column.</p>
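<p>A minimal sketch of this <code class="language-plaintext highlighter-rouge">basePath</code> usage, again with the hypothetical paths above:</p>
<figure class="highlight"><pre><code class="language-python" data-lang="python"># Read only the gender=male subtree, but keep 'gender' as a partition column
# by pointing basePath at the table root
df = spark.read.option("basePath", "path/to/table/").parquet("path/to/table/gender=male")
df.printSchema()  # the schema still contains the gender (and country) partition columns</code></pre></figure>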
<h3 id="schema-merging">Schema Merging</h3>
<p>Like Protocol Buffers, Avro, and Thrift, Parquet also supports schema evolution. Users can start with
a simple schema and gradually add more columns to it as needed. In this way, users may end
up with multiple Parquet files with different but mutually compatible schemas. The Parquet data
source is able to automatically detect this case and merge the schemas of all these files.</p>
<p>Since schema merging is a relatively expensive operation and is not a necessity in most cases, it
is turned off by default starting from Spark 1.5.0. You may enable it by</p>
<ol>
<li>setting data source option <code class="language-plaintext highlighter-rouge">mergeSchema</code> to <code class="language-plaintext highlighter-rouge">true</code> when reading Parquet files (as shown in the
examples below), or</li>
<li>setting the global SQL option <code class="language-plaintext highlighter-rouge">spark.sql.parquet.mergeSchema</code> to <code class="language-plaintext highlighter-rouge">true</code> (see the sketch after this list).</li>
</ol>
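<p>The examples below use the first approach (the per-read <code class="language-plaintext highlighter-rouge">mergeSchema</code> option). A minimal sketch of the second approach, assuming an existing <code class="language-plaintext highlighter-rouge">spark</code> session and the same <code class="language-plaintext highlighter-rouge">data/test_table</code> layout as in the examples:</p>
<figure class="highlight"><pre><code class="language-python" data-lang="python"># Enable schema merging globally instead of per read
spark.conf.set("spark.sql.parquet.mergeSchema", "true")
mergedDF = spark.read.parquet("data/test_table")  # no per-read mergeSchema option needed
mergedDF.printSchema()</code></pre></figure>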
<div class="codetabs">
<div data-lang="python">
<div class="highlight"><pre class="codehilite"><code><span class="kn">from</span> <span class="nn">pyspark.sql</span> <span class="kn">import</span> <span class="n">Row</span>
<span class="c1"># spark is from the previous example.
# Create a simple DataFrame, stored into a partition directory
</span><span class="n">sc</span> <span class="o">=</span> <span class="n">spark</span><span class="p">.</span><span class="n">sparkContext</span>
<span class="n">squaresDF</span> <span class="o">=</span> <span class="n">spark</span><span class="p">.</span><span class="n">createDataFrame</span><span class="p">(</span><span class="n">sc</span><span class="p">.</span><span class="n">parallelize</span><span class="p">(</span><span class="nb">range</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="mi">6</span><span class="p">))</span>
<span class="p">.</span><span class="nb">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">i</span><span class="p">:</span> <span class="n">Row</span><span class="p">(</span><span class="n">single</span><span class="o">=</span><span class="n">i</span><span class="p">,</span> <span class="n">double</span><span class="o">=</span><span class="n">i</span> <span class="o">**</span> <span class="mi">2</span><span class="p">)))</span>
<span class="n">squaresDF</span><span class="p">.</span><span class="n">write</span><span class="p">.</span><span class="n">parquet</span><span class="p">(</span><span class="s">"data/test_table/key=1"</span><span class="p">)</span>
<span class="c1"># Create another DataFrame in a new partition directory,
# adding a new column and dropping an existing column
</span><span class="n">cubesDF</span> <span class="o">=</span> <span class="n">spark</span><span class="p">.</span><span class="n">createDataFrame</span><span class="p">(</span><span class="n">sc</span><span class="p">.</span><span class="n">parallelize</span><span class="p">(</span><span class="nb">range</span><span class="p">(</span><span class="mi">6</span><span class="p">,</span> <span class="mi">11</span><span class="p">))</span>
<span class="p">.</span><span class="nb">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">i</span><span class="p">:</span> <span class="n">Row</span><span class="p">(</span><span class="n">single</span><span class="o">=</span><span class="n">i</span><span class="p">,</span> <span class="n">triple</span><span class="o">=</span><span class="n">i</span> <span class="o">**</span> <span class="mi">3</span><span class="p">)))</span>
<span class="n">cubesDF</span><span class="p">.</span><span class="n">write</span><span class="p">.</span><span class="n">parquet</span><span class="p">(</span><span class="s">"data/test_table/key=2"</span><span class="p">)</span>
<span class="c1"># Read the partitioned table
</span><span class="n">mergedDF</span> <span class="o">=</span> <span class="n">spark</span><span class="p">.</span><span class="n">read</span><span class="p">.</span><span class="n">option</span><span class="p">(</span><span class="s">"mergeSchema"</span><span class="p">,</span> <span class="s">"true"</span><span class="p">).</span><span class="n">parquet</span><span class="p">(</span><span class="s">"data/test_table"</span><span class="p">)</span>
<span class="n">mergedDF</span><span class="p">.</span><span class="n">printSchema</span><span class="p">()</span>
<span class="c1"># The final schema consists of all 3 columns in the Parquet files together
# with the partitioning column appeared in the partition directory paths.
# root
# |-- double: long (nullable = true)
# |-- single: long (nullable = true)
# |-- triple: long (nullable = true)
# |-- key: integer (nullable = true)</span></code></pre></div>
<div><small>Find full example code at "examples/src/main/python/sql/datasource.py" in the Spark repo.</small></div>
</div>
<div data-lang="scala">
<div class="highlight"><pre class="codehilite"><code><span class="c1">// This is used to implicitly convert an RDD to a DataFrame.</span>
<span class="k">import</span> <span class="nn">spark.implicits._</span>
<span class="c1">// Create a simple DataFrame, store into a partition directory</span>
<span class="k">val</span> <span class="nv">squaresDF</span> <span class="k">=</span> <span class="nv">spark</span><span class="o">.</span><span class="py">sparkContext</span><span class="o">.</span><span class="py">makeRDD</span><span class="o">(</span><span class="mi">1</span> <span class="n">to</span> <span class="mi">5</span><span class="o">).</span><span class="py">map</span><span class="o">(</span><span class="n">i</span> <span class="k">=&gt;</span> <span class="o">(</span><span class="n">i</span><span class="o">,</span> <span class="n">i</span> <span class="o">*</span> <span class="n">i</span><span class="o">)).</span><span class="py">toDF</span><span class="o">(</span><span class="s">"value"</span><span class="o">,</span> <span class="s">"square"</span><span class="o">)</span>
<span class="nv">squaresDF</span><span class="o">.</span><span class="py">write</span><span class="o">.</span><span class="py">parquet</span><span class="o">(</span><span class="s">"data/test_table/key=1"</span><span class="o">)</span>
<span class="c1">// Create another DataFrame in a new partition directory,</span>
<span class="c1">// adding a new column and dropping an existing column</span>
<span class="k">val</span> <span class="nv">cubesDF</span> <span class="k">=</span> <span class="nv">spark</span><span class="o">.</span><span class="py">sparkContext</span><span class="o">.</span><span class="py">makeRDD</span><span class="o">(</span><span class="mi">6</span> <span class="n">to</span> <span class="mi">10</span><span class="o">).</span><span class="py">map</span><span class="o">(</span><span class="n">i</span> <span class="k">=&gt;</span> <span class="o">(</span><span class="n">i</span><span class="o">,</span> <span class="n">i</span> <span class="o">*</span> <span class="n">i</span> <span class="o">*</span> <span class="n">i</span><span class="o">)).</span><span class="py">toDF</span><span class="o">(</span><span class="s">"value"</span><span class="o">,</span> <span class="s">"cube"</span><span class="o">)</span>
<span class="nv">cubesDF</span><span class="o">.</span><span class="py">write</span><span class="o">.</span><span class="py">parquet</span><span class="o">(</span><span class="s">"data/test_table/key=2"</span><span class="o">)</span>
<span class="c1">// Read the partitioned table</span>
<span class="k">val</span> <span class="nv">mergedDF</span> <span class="k">=</span> <span class="nv">spark</span><span class="o">.</span><span class="py">read</span><span class="o">.</span><span class="py">option</span><span class="o">(</span><span class="s">"mergeSchema"</span><span class="o">,</span> <span class="s">"true"</span><span class="o">).</span><span class="py">parquet</span><span class="o">(</span><span class="s">"data/test_table"</span><span class="o">)</span>
<span class="nv">mergedDF</span><span class="o">.</span><span class="py">printSchema</span><span class="o">()</span>
<span class="c1">// The final schema consists of all 3 columns in the Parquet files together</span>
<span class="c1">// with the partitioning column appeared in the partition directory paths</span>
<span class="c1">// root</span>
<span class="c1">// |-- value: int (nullable = true)</span>
<span class="c1">// |-- square: int (nullable = true)</span>
<span class="c1">// |-- cube: int (nullable = true)</span>
<span class="c1">// |-- key: int (nullable = true)</span></code></pre></div>
<div><small>Find full example code at "examples/src/main/scala/org/apache/spark/examples/sql/SQLDataSourceExample.scala" in the Spark repo.</small></div>
</div>
<div data-lang="java">
<div class="highlight"><pre class="codehilite"><code><span class="kn">import</span> <span class="nn">java.io.Serializable</span><span class="o">;</span>
<span class="kn">import</span> <span class="nn">java.util.ArrayList</span><span class="o">;</span>
<span class="kn">import</span> <span class="nn">java.util.Arrays</span><span class="o">;</span>
<span class="kn">import</span> <span class="nn">java.util.List</span><span class="o">;</span>
<span class="kn">import</span> <span class="nn">org.apache.spark.sql.Dataset</span><span class="o">;</span>
<span class="kn">import</span> <span class="nn">org.apache.spark.sql.Row</span><span class="o">;</span>
<span class="kd">public</span> <span class="kd">static</span> <span class="kd">class</span> <span class="nc">Square</span> <span class="kd">implements</span> <span class="nc">Serializable</span> <span class="o">{</span>
<span class="kd">private</span> <span class="kt">int</span> <span class="n">value</span><span class="o">;</span>
<span class="kd">private</span> <span class="kt">int</span> <span class="n">square</span><span class="o">;</span>
<span class="c1">// Getters and setters...</span>
<span class="o">}</span>
<span class="kd">public</span> <span class="kd">static</span> <span class="kd">class</span> <span class="nc">Cube</span> <span class="kd">implements</span> <span class="nc">Serializable</span> <span class="o">{</span>
<span class="kd">private</span> <span class="kt">int</span> <span class="n">value</span><span class="o">;</span>
<span class="kd">private</span> <span class="kt">int</span> <span class="n">cube</span><span class="o">;</span>
<span class="c1">// Getters and setters...</span>
<span class="o">}</span>
<span class="nc">List</span><span class="o">&lt;</span><span class="nc">Square</span><span class="o">&gt;</span> <span class="n">squares</span> <span class="o">=</span> <span class="k">new</span> <span class="nc">ArrayList</span><span class="o">&lt;&gt;();</span>
<span class="k">for</span> <span class="o">(</span><span class="kt">int</span> <span class="n">value</span> <span class="o">=</span> <span class="mi">1</span><span class="o">;</span> <span class="n">value</span> <span class="o">&lt;=</span> <span class="mi">5</span><span class="o">;</span> <span class="n">value</span><span class="o">++)</span> <span class="o">{</span>
<span class="nc">Square</span> <span class="n">square</span> <span class="o">=</span> <span class="k">new</span> <span class="nc">Square</span><span class="o">();</span>
<span class="n">square</span><span class="o">.</span><span class="na">setValue</span><span class="o">(</span><span class="n">value</span><span class="o">);</span>
<span class="n">square</span><span class="o">.</span><span class="na">setSquare</span><span class="o">(</span><span class="n">value</span> <span class="o">*</span> <span class="n">value</span><span class="o">);</span>
<span class="n">squares</span><span class="o">.</span><span class="na">add</span><span class="o">(</span><span class="n">square</span><span class="o">);</span>
<span class="o">}</span>
<span class="c1">// Create a simple DataFrame, store into a partition directory</span>
<span class="nc">Dataset</span><span class="o">&lt;</span><span class="nc">Row</span><span class="o">&gt;</span> <span class="n">squaresDF</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="na">createDataFrame</span><span class="o">(</span><span class="n">squares</span><span class="o">,</span> <span class="nc">Square</span><span class="o">.</span><span class="na">class</span><span class="o">);</span>
<span class="n">squaresDF</span><span class="o">.</span><span class="na">write</span><span class="o">().</span><span class="na">parquet</span><span class="o">(</span><span class="s">"data/test_table/key=1"</span><span class="o">);</span>
<span class="nc">List</span><span class="o">&lt;</span><span class="nc">Cube</span><span class="o">&gt;</span> <span class="n">cubes</span> <span class="o">=</span> <span class="k">new</span> <span class="nc">ArrayList</span><span class="o">&lt;&gt;();</span>
<span class="k">for</span> <span class="o">(</span><span class="kt">int</span> <span class="n">value</span> <span class="o">=</span> <span class="mi">6</span><span class="o">;</span> <span class="n">value</span> <span class="o">&lt;=</span> <span class="mi">10</span><span class="o">;</span> <span class="n">value</span><span class="o">++)</span> <span class="o">{</span>
<span class="nc">Cube</span> <span class="n">cube</span> <span class="o">=</span> <span class="k">new</span> <span class="nc">Cube</span><span class="o">();</span>
<span class="n">cube</span><span class="o">.</span><span class="na">setValue</span><span class="o">(</span><span class="n">value</span><span class="o">);</span>
<span class="n">cube</span><span class="o">.</span><span class="na">setCube</span><span class="o">(</span><span class="n">value</span> <span class="o">*</span> <span class="n">value</span> <span class="o">*</span> <span class="n">value</span><span class="o">);</span>
<span class="n">cubes</span><span class="o">.</span><span class="na">add</span><span class="o">(</span><span class="n">cube</span><span class="o">);</span>
<span class="o">}</span>
<span class="c1">// Create another DataFrame in a new partition directory,</span>
<span class="c1">// adding a new column and dropping an existing column</span>
<span class="nc">Dataset</span><span class="o">&lt;</span><span class="nc">Row</span><span class="o">&gt;</span> <span class="n">cubesDF</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="na">createDataFrame</span><span class="o">(</span><span class="n">cubes</span><span class="o">,</span> <span class="nc">Cube</span><span class="o">.</span><span class="na">class</span><span class="o">);</span>
<span class="n">cubesDF</span><span class="o">.</span><span class="na">write</span><span class="o">().</span><span class="na">parquet</span><span class="o">(</span><span class="s">"data/test_table/key=2"</span><span class="o">);</span>
<span class="c1">// Read the partitioned table</span>
<span class="nc">Dataset</span><span class="o">&lt;</span><span class="nc">Row</span><span class="o">&gt;</span> <span class="n">mergedDF</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="na">read</span><span class="o">().</span><span class="na">option</span><span class="o">(</span><span class="s">"mergeSchema"</span><span class="o">,</span> <span class="kc">true</span><span class="o">).</span><span class="na">parquet</span><span class="o">(</span><span class="s">"data/test_table"</span><span class="o">);</span>
<span class="n">mergedDF</span><span class="o">.</span><span class="na">printSchema</span><span class="o">();</span>
<span class="c1">// The final schema consists of all 3 columns in the Parquet files together</span>
<span class="c1">// with the partitioning column appeared in the partition directory paths</span>
<span class="c1">// root</span>
<span class="c1">// |-- value: int (nullable = true)</span>
<span class="c1">// |-- square: int (nullable = true)</span>
<span class="c1">// |-- cube: int (nullable = true)</span>
<span class="c1">// |-- key: int (nullable = true)</span></code></pre></div>
<div><small>Find full example code at "examples/src/main/java/org/apache/spark/examples/sql/JavaSQLDataSourceExample.java" in the Spark repo.</small></div>
</div>
<div data-lang="r">
<div class="highlight"><pre class="codehilite"><code><span class="n">df1</span><span class="w"> </span><span class="o">&lt;-</span><span class="w"> </span><span class="n">createDataFrame</span><span class="p">(</span><span class="n">data.frame</span><span class="p">(</span><span class="n">single</span><span class="o">=</span><span class="nf">c</span><span class="p">(</span><span class="m">12</span><span class="p">,</span><span class="w"> </span><span class="m">29</span><span class="p">),</span><span class="w"> </span><span class="n">double</span><span class="o">=</span><span class="nf">c</span><span class="p">(</span><span class="m">19</span><span class="p">,</span><span class="w"> </span><span class="m">23</span><span class="p">)))</span><span class="w">
</span><span class="n">df2</span><span class="w"> </span><span class="o">&lt;-</span><span class="w"> </span><span class="n">createDataFrame</span><span class="p">(</span><span class="n">data.frame</span><span class="p">(</span><span class="n">double</span><span class="o">=</span><span class="nf">c</span><span class="p">(</span><span class="m">19</span><span class="p">,</span><span class="w"> </span><span class="m">23</span><span class="p">),</span><span class="w"> </span><span class="n">triple</span><span class="o">=</span><span class="nf">c</span><span class="p">(</span><span class="m">23</span><span class="p">,</span><span class="w"> </span><span class="m">18</span><span class="p">)))</span><span class="w">
</span><span class="c1"># Create a simple DataFrame, stored into a partition directory</span><span class="w">
</span><span class="n">write.df</span><span class="p">(</span><span class="n">df1</span><span class="p">,</span><span class="w"> </span><span class="s2">"data/test_table/key=1"</span><span class="p">,</span><span class="w"> </span><span class="s2">"parquet"</span><span class="p">,</span><span class="w"> </span><span class="s2">"overwrite"</span><span class="p">)</span><span class="w">
</span><span class="c1"># Create another DataFrame in a new partition directory,</span><span class="w">
</span><span class="c1"># adding a new column and dropping an existing column</span><span class="w">
</span><span class="n">write.df</span><span class="p">(</span><span class="n">df2</span><span class="p">,</span><span class="w"> </span><span class="s2">"data/test_table/key=2"</span><span class="p">,</span><span class="w"> </span><span class="s2">"parquet"</span><span class="p">,</span><span class="w"> </span><span class="s2">"overwrite"</span><span class="p">)</span><span class="w">
</span><span class="c1"># Read the partitioned table</span><span class="w">
</span><span class="n">df3</span><span class="w"> </span><span class="o">&lt;-</span><span class="w"> </span><span class="n">read.df</span><span class="p">(</span><span class="s2">"data/test_table"</span><span class="p">,</span><span class="w"> </span><span class="s2">"parquet"</span><span class="p">,</span><span class="w"> </span><span class="n">mergeSchema</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="s2">"true"</span><span class="p">)</span><span class="w">
</span><span class="n">printSchema</span><span class="p">(</span><span class="n">df3</span><span class="p">)</span><span class="w">
</span><span class="c1"># The final schema consists of all 3 columns in the Parquet files together</span><span class="w">
</span><span class="c1"># with the partitioning column appeared in the partition directory paths</span><span class="w">
</span><span class="c1">## root</span><span class="w">
</span><span class="c1">## |-- single: double (nullable = true)</span><span class="w">
</span><span class="c1">## |-- double: double (nullable = true)</span><span class="w">
</span><span class="c1">## |-- triple: double (nullable = true)</span><span class="w">
</span><span class="c1">## |-- key: integer (nullable = true)</span></code></pre></div>
<div><small>Find full example code at "examples/src/main/r/RSparkSQLExample.R" in the Spark repo.</small></div>
</div>
</div>
<h3 id="hive-metastore-parquet-table-conversion">Hive metastore Parquet table conversion</h3>
<p>When reading from Hive metastore Parquet tables and writing to non-partitioned Hive metastore
Parquet tables, Spark SQL will try to use its own Parquet support instead of Hive SerDe for
better performance. This behavior is controlled by the <code class="language-plaintext highlighter-rouge">spark.sql.hive.convertMetastoreParquet</code>
configuration, and is turned on by default.</p>
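<p>For example, a minimal PySpark sketch that turns the conversion off, assuming a <code class="language-plaintext highlighter-rouge">spark</code> session built with Hive support:</p>
<figure class="highlight"><pre><code class="language-python" data-lang="python"># Fall back to the Hive SerDe when reading and writing Hive metastore Parquet tables
spark.conf.set("spark.sql.hive.convertMetastoreParquet", "false")</code></pre></figure>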
<h4 id="hiveparquet-schema-reconciliation">Hive/Parquet Schema Reconciliation</h4>
<p>There are two key differences between Hive and Parquet from the perspective of table schema
processing.</p>
<ol>
<li>Hive is case insensitive, while Parquet is not</li>
<li>Hive considers all columns nullable, while nullability in Parquet is significant</li>
</ol>
<p>For this reason, we must reconcile the Hive metastore schema with the Parquet schema when converting a
Hive metastore Parquet table to a Spark SQL Parquet table. The reconciliation rules are:</p>
<ol>
<li>
<p>Fields that have the same name in both schemas must have the same data type regardless of
nullability. The reconciled field should have the data type of the Parquet side, so that
nullability is respected.</p>
</li>
<li>
<p>The reconciled schema contains exactly those fields defined in the Hive metastore schema.</p>
<ul>
<li>Any fields that only appear in the Parquet schema are dropped in the reconciled schema.</li>
<li>Any fields that only appear in the Hive metastore schema are added as nullable fields in the
reconciled schema.</li>
</ul>
</li>
</ol>
<h4 id="metadata-refreshing">Metadata Refreshing</h4>
<p>Spark SQL caches Parquet metadata for better performance. When Hive metastore Parquet table
conversion is enabled, the metadata of those converted tables is also cached. If these tables are
updated by Hive or other external tools, you need to refresh them manually to ensure consistent
metadata.</p>
<div class="codetabs">
<div data-lang="python">
<figure class="highlight"><pre><code class="language-python" data-lang="python"><span class="c1"># spark is an existing SparkSession
</span><span class="n">spark</span><span class="p">.</span><span class="n">catalog</span><span class="p">.</span><span class="n">refreshTable</span><span class="p">(</span><span class="s">"my_table"</span><span class="p">)</span></code></pre></figure>
</div>
<div data-lang="scala">
<figure class="highlight"><pre><code class="language-scala" data-lang="scala"><span class="c1">// spark is an existing SparkSession</span>
<span class="nv">spark</span><span class="o">.</span><span class="py">catalog</span><span class="o">.</span><span class="py">refreshTable</span><span class="o">(</span><span class="s">"my_table"</span><span class="o">)</span></code></pre></figure>
</div>
<div data-lang="java">
<figure class="highlight"><pre><code class="language-java" data-lang="java"><span class="c1">// spark is an existing SparkSession</span>
<span class="n">spark</span><span class="o">.</span><span class="na">catalog</span><span class="o">().</span><span class="na">refreshTable</span><span class="o">(</span><span class="s">"my_table"</span><span class="o">);</span></code></pre></figure>
</div>
<div data-lang="r">
<figure class="highlight"><pre><code class="language-r" data-lang="r"><span class="n">refreshTable</span><span class="p">(</span><span class="s2">"my_table"</span><span class="p">)</span></code></pre></figure>
</div>
<div data-lang="SQL">
<figure class="highlight"><pre><code class="language-sql" data-lang="sql"><span class="n">REFRESH</span> <span class="k">TABLE</span> <span class="n">my_table</span><span class="p">;</span></code></pre></figure>
</div>
</div>
<h2 id="columnar-encryption">Columnar Encryption</h2>
<p>Since Spark 3.2, columnar encryption is supported for Parquet tables with Apache Parquet 1.12+.</p>
<p>Parquet uses envelope encryption, where file parts are encrypted with &#8220;data encryption keys&#8221; (DEKs), and the DEKs are encrypted with &#8220;master encryption keys&#8221; (MEKs). The DEKs are randomly generated by Parquet for each encrypted file/column. The MEKs are generated, stored and managed in a Key Management Service (KMS) of the user&#8217;s choice. The Parquet Maven <a href="https://repo1.maven.org/maven2/org/apache/parquet/parquet-hadoop/1.13.1/">repository</a> has a jar with a mock KMS implementation that allows running column encryption and decryption from a spark-shell alone, without deploying a KMS server (download the <code class="language-plaintext highlighter-rouge">parquet-hadoop-tests.jar</code> file and place it in the Spark <code class="language-plaintext highlighter-rouge">jars</code> folder):</p>
<div class="codetabs">
<div data-lang="python">
<figure class="highlight"><pre><code class="language-python" data-lang="python"><span class="c1"># Set hadoop configuration properties, e.g. using configuration properties of
# the Spark job:
# --conf spark.hadoop.parquet.encryption.kms.client.class=\
# "org.apache.parquet.crypto.keytools.mocks.InMemoryKMS"\
# --conf spark.hadoop.parquet.encryption.key.list=\
# "keyA:AAECAwQFBgcICQoLDA0ODw== , keyB:AAECAAECAAECAAECAAECAA=="\
# --conf spark.hadoop.parquet.crypto.factory.class=\
# "org.apache.parquet.crypto.keytools.PropertiesDrivenCryptoFactory"
</span>
<span class="c1"># Write encrypted dataframe files.
# Column "square" will be protected with master key "keyA".
# Parquet file footers will be protected with master key "keyB"
</span><span class="n">squaresDF</span><span class="p">.</span><span class="n">write</span>\
<span class="p">.</span><span class="n">option</span><span class="p">(</span><span class="s">"parquet.encryption.column.keys"</span> <span class="p">,</span> <span class="s">"keyA:square"</span><span class="p">)</span>\
<span class="p">.</span><span class="n">option</span><span class="p">(</span><span class="s">"parquet.encryption.footer.key"</span> <span class="p">,</span> <span class="s">"keyB"</span><span class="p">)</span>\
<span class="p">.</span><span class="n">parquet</span><span class="p">(</span><span class="s">"/path/to/table.parquet.encrypted"</span><span class="p">)</span>
<span class="c1"># Read encrypted dataframe files
</span><span class="n">df2</span> <span class="o">=</span> <span class="n">spark</span><span class="p">.</span><span class="n">read</span><span class="p">.</span><span class="n">parquet</span><span class="p">(</span><span class="s">"/path/to/table.parquet.encrypted"</span><span class="p">)</span></code></pre></figure>
</div>
<div data-lang="scala">
<figure class="highlight"><pre><code class="language-scala" data-lang="scala"><span class="nv">sc</span><span class="o">.</span><span class="py">hadoopConfiguration</span><span class="o">.</span><span class="py">set</span><span class="o">(</span><span class="s">"parquet.encryption.kms.client.class"</span> <span class="o">,</span>
<span class="s">"org.apache.parquet.crypto.keytools.mocks.InMemoryKMS"</span><span class="o">)</span>
<span class="c1">// Explicit master keys (base64 encoded) - required only for mock InMemoryKMS</span>
<span class="nv">sc</span><span class="o">.</span><span class="py">hadoopConfiguration</span><span class="o">.</span><span class="py">set</span><span class="o">(</span><span class="s">"parquet.encryption.key.list"</span> <span class="o">,</span>
<span class="s">"keyA:AAECAwQFBgcICQoLDA0ODw== , keyB:AAECAAECAAECAAECAAECAA=="</span><span class="o">)</span>
<span class="c1">// Activate Parquet encryption, driven by Hadoop properties</span>
<span class="nv">sc</span><span class="o">.</span><span class="py">hadoopConfiguration</span><span class="o">.</span><span class="py">set</span><span class="o">(</span><span class="s">"parquet.crypto.factory.class"</span> <span class="o">,</span>
<span class="s">"org.apache.parquet.crypto.keytools.PropertiesDrivenCryptoFactory"</span><span class="o">)</span>
<span class="c1">// Write encrypted dataframe files.</span>
<span class="c1">// Column "square" will be protected with master key "keyA".</span>
<span class="c1">// Parquet file footers will be protected with master key "keyB"</span>
<span class="nv">squaresDF</span><span class="o">.</span><span class="py">write</span><span class="o">.</span>
<span class="nf">option</span><span class="o">(</span><span class="s">"parquet.encryption.column.keys"</span> <span class="o">,</span> <span class="s">"keyA:square"</span><span class="o">).</span>
<span class="nf">option</span><span class="o">(</span><span class="s">"parquet.encryption.footer.key"</span> <span class="o">,</span> <span class="s">"keyB"</span><span class="o">).</span>
<span class="nf">parquet</span><span class="o">(</span><span class="s">"/path/to/table.parquet.encrypted"</span><span class="o">)</span>
<span class="c1">// Read encrypted dataframe files</span>
<span class="k">val</span> <span class="nv">df2</span> <span class="k">=</span> <span class="nv">spark</span><span class="o">.</span><span class="py">read</span><span class="o">.</span><span class="py">parquet</span><span class="o">(</span><span class="s">"/path/to/table.parquet.encrypted"</span><span class="o">)</span></code></pre></figure>
</div>
<div data-lang="java">
<figure class="highlight"><pre><code class="language-java" data-lang="java"><span class="n">sc</span><span class="o">.</span><span class="na">hadoopConfiguration</span><span class="o">().</span><span class="na">set</span><span class="o">(</span><span class="s">"parquet.encryption.kms.client.class"</span> <span class="o">,</span>
<span class="s">"org.apache.parquet.crypto.keytools.mocks.InMemoryKMS"</span><span class="o">);</span>
<span class="c1">// Explicit master keys (base64 encoded) - required only for mock InMemoryKMS</span>
<span class="n">sc</span><span class="o">.</span><span class="na">hadoopConfiguration</span><span class="o">().</span><span class="na">set</span><span class="o">(</span><span class="s">"parquet.encryption.key.list"</span> <span class="o">,</span>
<span class="s">"keyA:AAECAwQFBgcICQoLDA0ODw== , keyB:AAECAAECAAECAAECAAECAA=="</span><span class="o">);</span>
<span class="c1">// Activate Parquet encryption, driven by Hadoop properties</span>
<span class="n">sc</span><span class="o">.</span><span class="na">hadoopConfiguration</span><span class="o">().</span><span class="na">set</span><span class="o">(</span><span class="s">"parquet.crypto.factory.class"</span> <span class="o">,</span>
<span class="s">"org.apache.parquet.crypto.keytools.PropertiesDrivenCryptoFactory"</span><span class="o">);</span>
<span class="c1">// Write encrypted dataframe files.</span>
<span class="c1">// Column "square" will be protected with master key "keyA".</span>
<span class="c1">// Parquet file footers will be protected with master key "keyB"</span>
<span class="n">squaresDF</span><span class="o">.</span><span class="na">write</span><span class="o">().</span>
<span class="n">option</span><span class="o">(</span><span class="s">"parquet.encryption.column.keys"</span> <span class="o">,</span> <span class="s">"keyA:square"</span><span class="o">).</span>
<span class="n">option</span><span class="o">(</span><span class="s">"parquet.encryption.footer.key"</span> <span class="o">,</span> <span class="s">"keyB"</span><span class="o">).</span>
<span class="n">parquet</span><span class="o">(</span><span class="s">"/path/to/table.parquet.encrypted"</span><span class="o">);</span>
<span class="c1">// Read encrypted dataframe files</span>
<span class="nc">Dataset</span><span class="o">&lt;</span><span class="nc">Row</span><span class="o">&gt;</span> <span class="n">df2</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="na">read</span><span class="o">().</span><span class="na">parquet</span><span class="o">(</span><span class="s">"/path/to/table.parquet.encrypted"</span><span class="o">);</span></code></pre></figure>
</div>
</div>
<h4 id="kms-client">KMS Client</h4>
<p>The InMemoryKMS class is provided only for illustration and simple demonstration of Parquet encryption functionality. <strong>It should not be used in a real deployment</strong>. The master encryption keys must be kept and managed in a production-grade KMS system, deployed in the user&#8217;s organization. Rolling out Spark with Parquet encryption requires implementing a client class for the KMS server. Parquet provides a plug-in <a href="https://github.com/apache/parquet-mr/blob/apache-parquet-1.13.1/parquet-hadoop/src/main/java/org/apache/parquet/crypto/keytools/KmsClient.java">interface</a> for the development of such classes:</p>
<div data-lang="java">
<figure class="highlight"><pre><code class="language-java" data-lang="java"><span class="kd">public</span> <span class="kd">interface</span> <span class="nc">KmsClient</span> <span class="o">{</span>
<span class="c1">// Wraps a key - encrypts it with the master key.</span>
<span class="kd">public</span> <span class="nc">String</span> <span class="nf">wrapKey</span><span class="o">(</span><span class="kt">byte</span><span class="o">[]</span> <span class="n">keyBytes</span><span class="o">,</span> <span class="nc">String</span> <span class="n">masterKeyIdentifier</span><span class="o">);</span>
<span class="c1">// Decrypts (unwraps) a key with the master key.</span>
<span class="kd">public</span> <span class="kt">byte</span><span class="o">[]</span> <span class="nf">unwrapKey</span><span class="o">(</span><span class="nc">String</span> <span class="n">wrappedKey</span><span class="o">,</span> <span class="nc">String</span> <span class="n">masterKeyIdentifier</span><span class="o">);</span>
<span class="c1">// Use of initialization parameters is optional.</span>
<span class="kd">public</span> <span class="kt">void</span> <span class="nf">initialize</span><span class="o">(</span><span class="nc">Configuration</span> <span class="n">configuration</span><span class="o">,</span> <span class="nc">String</span> <span class="n">kmsInstanceID</span><span class="o">,</span>
<span class="nc">String</span> <span class="n">kmsInstanceURL</span><span class="o">,</span> <span class="nc">String</span> <span class="n">accessToken</span><span class="o">);</span>
<span class="o">}</span></code></pre></figure>
</div>
<p>An <a href="https://github.com/apache/parquet-mr/blob/master/parquet-hadoop/src/test/java/org/apache/parquet/crypto/keytools/samples/VaultClient.java">example</a> of such a class for an open-source <a href="https://www.vaultproject.io/api/secret/transit">KMS</a> can be found in the parquet-mr repository. The production KMS client should be designed in cooperation with the organization&#8217;s security administrators, and built by developers with experience in access control management. Once such a class is created, it can be passed to applications via the <code class="language-plaintext highlighter-rouge">parquet.encryption.kms.client.class</code> parameter and leveraged by general Spark users as shown in the encrypted dataframe write/read sample above.</p>
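<p>For illustration only, below is a minimal Scala sketch of such a client, assuming a hypothetical in-house KMS reachable at the <code class="language-plaintext highlighter-rouge">kmsInstanceURL</code> passed to <code class="language-plaintext highlighter-rouge">initialize</code>; the actual wrap/unwrap calls are left unimplemented and would be filled in against the organization&#8217;s own KMS API.</p>
<figure class="highlight"><pre><code class="language-scala" data-lang="scala">package com.example.kms  // hypothetical package for this sketch

import org.apache.hadoop.conf.Configuration
import org.apache.parquet.crypto.keytools.KmsClient

// A minimal sketch; error handling, retries and token refresh are omitted.
class MyOrgKmsClient extends KmsClient {
  private var kmsUrl: String = _
  private var token: String = _

  // Use of the initialization parameters is optional (see the interface above).
  override def initialize(configuration: Configuration, kmsInstanceID: String,
                          kmsInstanceURL: String, accessToken: String): Unit = {
    kmsUrl = kmsInstanceURL
    token = accessToken
  }

  // Ask the KMS to encrypt (wrap) the key bytes with the given master key.
  override def wrapKey(keyBytes: Array[Byte], masterKeyIdentifier: String): String = {
    // e.g. POST the base64-encoded key to s"$kmsUrl/wrap/$masterKeyIdentifier", authenticated with token
    ???
  }

  // Ask the KMS to decrypt (unwrap) a previously wrapped key.
  override def unwrapKey(wrappedKey: String, masterKeyIdentifier: String): Array[Byte] = {
    // e.g. POST wrappedKey to s"$kmsUrl/unwrap/$masterKeyIdentifier", authenticated with token
    ???
  }
}</code></pre></figure>
<p>Once such a class is on the executors&#8217; classpath, it would be registered in the same way as the mock client in the samples above, e.g. <code class="language-plaintext highlighter-rouge">sc.hadoopConfiguration.set("parquet.encryption.kms.client.class", "com.example.kms.MyOrgKmsClient")</code>.</p>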
<p>Note: By default, Parquet implements a &#8220;double envelope encryption&#8221; mode that minimizes the interaction of Spark executors with a KMS server. In this mode, the DEKs are encrypted with &#8220;key encryption keys&#8221; (KEKs, randomly generated by Parquet). The KEKs are encrypted with MEKs in the KMS; the result and the KEK itself are cached in Spark executor memory. Users interested in regular envelope encryption can switch to it by setting the <code class="language-plaintext highlighter-rouge">parquet.encryption.double.wrapping</code> parameter to <code class="language-plaintext highlighter-rouge">false</code>. For more details on Parquet encryption parameters, visit the parquet-hadoop configuration <a href="https://github.com/apache/parquet-mr/blob/master/parquet-hadoop/README.md#class-propertiesdrivencryptofactory">page</a>.</p>
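<p>A minimal Scala sketch of that switch, assuming the same <code class="language-plaintext highlighter-rouge">SparkContext</code> <code class="language-plaintext highlighter-rouge">sc</code> and encryption settings as in the samples above:</p>
<figure class="highlight"><pre><code class="language-scala" data-lang="scala">// Optional: fall back from double envelope encryption to regular envelope
// encryption; all other encryption properties stay as configured above.
sc.hadoopConfiguration.set("parquet.encryption.double.wrapping", "false")</code></pre></figure>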
<h2 id="data-source-option">Data Source Option</h2>
<p>Data source options of Parquet can be set via:</p>
<ul>
<li>the <code class="language-plaintext highlighter-rouge">.option</code>/<code class="language-plaintext highlighter-rouge">.options</code> methods of
<ul>
<li><code class="language-plaintext highlighter-rouge">DataFrameReader</code></li>
<li><code class="language-plaintext highlighter-rouge">DataFrameWriter</code></li>
<li><code class="language-plaintext highlighter-rouge">DataStreamReader</code></li>
<li><code class="language-plaintext highlighter-rouge">DataStreamWriter</code></li>
</ul>
</li>
<li><code class="language-plaintext highlighter-rouge">OPTIONS</code> clause at <a href="sql-ref-syntax-ddl-create-table-datasource.html">CREATE TABLE USING DATA_SOURCE</a></li>
</ul>
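<p>For example, the <code class="language-plaintext highlighter-rouge">mergeSchema</code> option from the table below can be set either way (a Scala/SQL sketch; the table name and paths are placeholders):</p>
<figure class="highlight"><pre><code class="language-scala" data-lang="scala">// Per-query data source option via DataFrameReader.
val mergedDF = spark.read.option("mergeSchema", "true").parquet("/path/to/table.parquet")

// The same option supplied through the OPTIONS clause of CREATE TABLE ... USING.
spark.sql("""
  CREATE TABLE parquet_tbl
  USING parquet
  OPTIONS (path '/path/to/table.parquet', mergeSchema 'true')
""")</code></pre></figure>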
<table class="table table-striped">
<thead><tr><th><b>Property Name</b></th><th><b>Default</b></th><th><b>Meaning</b></th><th><b>Scope</b></th></tr></thead>
<tr>
<td><code>datetimeRebaseMode</code></td>
<td>(value of <code>spark.sql.parquet.datetimeRebaseModeInRead</code> configuration)</td>
<td>The <code>datetimeRebaseMode</code> option allows specifying the rebasing mode for the values of the <code>DATE</code>, <code>TIMESTAMP_MILLIS</code>, <code>TIMESTAMP_MICROS</code> logical types from the Julian to the Proleptic Gregorian calendar.<br />
Currently supported modes are:
<ul>
<li><code>EXCEPTION</code>: fails in reads of ancient dates/timestamps that are ambiguous between the two calendars.</li>
<li><code>CORRECTED</code>: loads dates/timestamps without rebasing.</li>
<li><code>LEGACY</code>: performs rebasing of ancient dates/timestamps from the Julian to Proleptic Gregorian calendar.</li>
</ul>
</td>
<td>read</td>
</tr>
<tr>
<td><code>int96RebaseMode</code></td>
<td>(value of <code>spark.sql.parquet.int96RebaseModeInRead</code> configuration)</td>
<td>The <code>int96RebaseMode</code> option allows specifying the rebasing mode for INT96 timestamps from the Julian to the Proleptic Gregorian calendar.<br />
Currently supported modes are:
<ul>
<li><code>EXCEPTION</code>: fails in reads of ancient INT96 timestamps that are ambiguous between the two calendars.</li>
<li><code>CORRECTED</code>: loads INT96 timestamps without rebasing.</li>
<li><code>LEGACY</code>: performs rebasing of ancient timestamps from the Julian to Proleptic Gregorian calendar.</li>
</ul>
</td>
<td>read</td>
</tr>
<tr>
<td><code>mergeSchema</code></td>
<td>(value of <code>spark.sql.parquet.mergeSchema</code> configuration)</td>
<td>Sets whether we should merge schemas collected from all Parquet part-files. This will override <code>spark.sql.parquet.mergeSchema</code>.</td>
<td>read</td>
</tr>
<tr>
<td><code>compression</code></td>
<td><code>snappy</code></td>
<td>Compression codec to use when saving to file. This can be one of the known case-insensitive shortened names (none, uncompressed, snappy, gzip, lzo, brotli, lz4, and zstd). This will override <code>spark.sql.parquet.compression.codec</code>.</td>
<td>write</td>
</tr>
</table>
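<p>As an illustration, the read and write options above could be combined as follows (a Scala sketch; paths are placeholders):</p>
<figure class="highlight"><pre><code class="language-scala" data-lang="scala">// Read ancient dates/timestamps without rebasing, then rewrite the data
// with a non-default compression codec.
val rebasedDF = spark.read
  .option("datetimeRebaseMode", "CORRECTED")
  .parquet("/path/to/legacy_table.parquet")

rebasedDF.write
  .option("compression", "zstd")
  .parquet("/path/to/recompressed_table.parquet")</code></pre></figure>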
<p>Other generic options can be found in <a href="https://spark.apache.org/docs/latest/sql-data-sources-generic-options.html">Generic File Source Options</a>.</p>
<h3 id="configuration">Configuration</h3>
<p>Configuration of Parquet can be done using the <code class="language-plaintext highlighter-rouge">setConf</code> method on <code class="language-plaintext highlighter-rouge">SparkSession</code> or by running
<code class="language-plaintext highlighter-rouge">SET key=value</code> commands using SQL.</p>
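<p>For example, either form could be used to toggle one of the properties listed below (a Scala sketch):</p>
<figure class="highlight"><pre><code class="language-scala" data-lang="scala">// Programmatic, for the current SparkSession.
spark.conf.set("spark.sql.parquet.filterPushdown", "true")

// Equivalent SQL command.
spark.sql("SET spark.sql.parquet.filterPushdown=true")</code></pre></figure>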
<table class="table table-striped">
<thead><tr><th>Property Name</th><th>Default</th><th>Meaning</th><th>Since Version</th></tr></thead>
<tr>
<td><code>spark.sql.parquet.binaryAsString</code></td>
<td>false</td>
<td>
Some Parquet-producing systems, in particular Impala, Hive, and older versions of Spark SQL, do
not differentiate between binary data and strings when writing out the Parquet schema. This
flag tells Spark SQL to interpret binary data as a string to provide compatibility with these systems.
</td>
<td>1.1.1</td>
</tr>
<tr>
<td><code>spark.sql.parquet.int96AsTimestamp</code></td>
<td>true</td>
<td>
Some Parquet-producing systems, in particular Impala and Hive, store Timestamp into INT96. This
flag tells Spark SQL to interpret INT96 data as a timestamp to provide compatibility with these systems.
</td>
<td>1.3.0</td>
</tr>
<tr>
<td><code>spark.sql.parquet.int96TimestampConversion</code></td>
<td>false</td>
<td>
This controls whether timestamp adjustments should be applied to INT96 data when
converting to timestamps, for data written by Impala. This is necessary because Impala
stores INT96 data with a different timezone offset than Hive &amp; Spark.
</td>
<td>2.3.0</td>
</tr>
<tr>
<td><code>spark.sql.parquet.outputTimestampType</code></td>
<td>INT96</td>
<td>
Sets which Parquet timestamp type to use when Spark writes data to Parquet files.
INT96 is a non-standard but commonly used timestamp type in Parquet. TIMESTAMP_MICROS
is a standard timestamp type in Parquet, which stores number of microseconds from the
Unix epoch. TIMESTAMP_MILLIS is also standard, but with millisecond precision, which
means Spark has to truncate the microsecond portion of its timestamp value.
</td>
<td>2.3.0</td>
</tr>
<tr>
<td><code>spark.sql.parquet.compression.codec</code></td>
<td>snappy</td>
<td>
Sets the compression codec used when writing Parquet files. If either <code>compression</code> or
<code>parquet.compression</code> is specified in the table-specific options/properties, the precedence would be
<code>compression</code>, <code>parquet.compression</code>, <code>spark.sql.parquet.compression.codec</code>. Acceptable values include:
none, uncompressed, snappy, gzip, lzo, brotli, lz4, zstd.
Note that <code>brotli</code> requires <code>BrotliCodec</code> to be installed.
</td>
<td>1.1.1</td>
</tr>
<tr>
<td><code>spark.sql.parquet.filterPushdown</code></td>
<td>true</td>
<td>Enables Parquet filter push-down optimization when set to true.</td>
<td>1.2.0</td>
</tr>
<tr>
<td><code>spark.sql.parquet.aggregatePushdown</code></td>
<td>false</td>
<td>
If true, aggregates will be pushed down to Parquet for optimization. MIN, MAX,
and COUNT are supported as aggregate expressions. For MIN/MAX, boolean, integer, float, and
date types are supported; for COUNT, all data types are supported. If statistics are missing
from any Parquet file footer, an exception is thrown.
</td>
<td>3.3.0</td>
</tr>
<tr>
<td><code>spark.sql.hive.convertMetastoreParquet</code></td>
<td>true</td>
<td>
When set to false, Spark SQL will use the Hive SerDe for parquet tables instead of the built-in
support.
</td>
<td>1.1.1</td>
</tr>
<tr>
<td><code>spark.sql.parquet.mergeSchema</code></td>
<td>false</td>
<td>
<p>
When true, the Parquet data source merges schemas collected from all data files, otherwise the
schema is picked from the summary file or a random data file if no summary file is available.
</p>
</td>
<td>1.5.0</td>
</tr>
<tr>
<td><code>spark.sql.parquet.respectSummaryFiles</code></td>
<td>false</td>
<td>
When true, we assume that all Parquet part-files are consistent with the
summary files and will ignore them when merging the schema. Otherwise, if this is
false, which is the default, we will merge all part-files. This should be considered
an expert-only option, and shouldn't be enabled before knowing exactly what it means.
</td>
<td>1.5.0</td>
</tr>
<tr>
<td><code>spark.sql.parquet.writeLegacyFormat</code></td>
<td>false</td>
<td>
If true, data will be written in the way of Spark 1.4 and earlier. For example, decimal values
will be written in Apache Parquet's fixed-length byte array format, which other systems such as
Apache Hive and Apache Impala use. If false, the newer format in Parquet will be used. For
example, decimals will be written in int-based format. If Parquet output is intended for use
with systems that do not support this newer format, set to true.
</td>
<td>1.6.0</td>
</tr>
<tr>
<td><code>spark.sql.parquet.enableVectorizedReader</code></td>
<td>true</td>
<td>
Enables vectorized parquet decoding.
</td>
<td>2.0.0</td>
</tr>
<tr>
<td><code>spark.sql.parquet.enableNestedColumnVectorizedReader</code></td>
<td>true</td>
<td>
Enables vectorized Parquet decoding for nested columns (e.g., struct, list, map).
Requires <code>spark.sql.parquet.enableVectorizedReader</code> to be enabled.
</td>
<td>3.3.0</td>
</tr>
<tr>
<td><code>spark.sql.parquet.recordLevelFilter.enabled</code></td>
<td>false</td>
<td>
If true, enables Parquet's native record-level filtering using the pushed down filters.
This configuration only has an effect when <code>spark.sql.parquet.filterPushdown</code>
is enabled and the vectorized reader is not used. You can ensure the vectorized reader
is not used by setting <code>spark.sql.parquet.enableVectorizedReader</code> to false.
</td>
<td>2.3.0</td>
</tr>
<tr>
<td><code>spark.sql.parquet.columnarReaderBatchSize</code></td>
<td>4096</td>
<td>
The number of rows to include in a parquet vectorized reader batch. The number should
be carefully chosen to minimize overhead and avoid OOMs in reading data.
</td>
<td>2.4.0</td>
</tr>
<tr>
<td><code>spark.sql.parquet.fieldId.write.enabled</code></td>
<td>true</td>
<td>
Field ID is a native field of the Parquet schema spec. When enabled,
Parquet writers will propagate the field ID metadata (if present) from the Spark schema to the Parquet schema.
</td>
<td>3.3.0</td>
</tr>
<tr>
<td><code>spark.sql.parquet.fieldId.read.enabled</code></td>
<td>false</td>
<td>
Field ID is a native field of the Parquet schema spec. When enabled, Parquet readers
will use field IDs (if present) in the requested Spark schema to look up Parquet
fields instead of using column names.
</td>
<td>3.3.0</td>
</tr>
<tr>
<td><code>spark.sql.parquet.fieldId.read.ignoreMissing</code></td>
<td>false</td>
<td>
When the Parquet file doesn't have any field IDs but the
Spark read schema uses field IDs to read, we will silently return nulls
when this flag is enabled, or raise an error otherwise.
</td>
<td>3.3.0</td>
</tr>
<tr>
<td><code>spark.sql.parquet.timestampNTZ.enabled</code></td>
<td>true</td>
<td>
Enables <code>TIMESTAMP_NTZ</code> support for Parquet reads and writes.
When enabled, <code>TIMESTAMP_NTZ</code> values are written as Parquet timestamp
columns with the annotation <code>isAdjustedToUTC = false</code> and are inferred in a similar way.
When disabled, such columns are read as <code>TIMESTAMP_LTZ</code>, and <code>TIMESTAMP_NTZ</code>
values have to be converted to <code>TIMESTAMP_LTZ</code> for writes.
</td>
<td>3.4.0</td>
</tr>
<tr>
<td><code>spark.sql.parquet.datetimeRebaseModeInRead</code></td>
<td><code>EXCEPTION</code></td>
<td>The rebasing mode for the values of the <code>DATE</code>, <code>TIMESTAMP_MILLIS</code>, <code>TIMESTAMP_MICROS</code> logical types from the Julian to Proleptic Gregorian calendar:<br />
<ul>
<li><code>EXCEPTION</code>: Spark will fail the reading if it sees ancient dates/timestamps that are ambiguous between the two calendars.</li>
<li><code>CORRECTED</code>: Spark will not rebase and will read the dates/timestamps as they are.</li>
<li><code>LEGACY</code>: Spark will rebase dates/timestamps from the legacy hybrid (Julian + Gregorian) calendar to Proleptic Gregorian calendar when reading Parquet files.</li>
</ul>
This config is only effective if the writer info (like Spark, Hive) of the Parquet files is unknown.
</td>
<td>3.0.0</td>
</tr>
<tr>
<td><code>spark.sql.parquet.datetimeRebaseModeInWrite</code></td>
<td><code>EXCEPTION</code></td>
<td>The rebasing mode for the values of the <code>DATE</code>, <code>TIMESTAMP_MILLIS</code>, <code>TIMESTAMP_MICROS</code> logical types from the Proleptic Gregorian to Julian calendar:<br />
<ul>
<li><code>EXCEPTION</code>: Spark will fail the writing if it sees ancient dates/timestamps that are ambiguous between the two calendars.</li>
<li><code>CORRECTED</code>: Spark will not rebase and will write the dates/timestamps as they are.</li>
<li><code>LEGACY</code>: Spark will rebase dates/timestamps from Proleptic Gregorian calendar to the legacy hybrid (Julian + Gregorian) calendar when writing Parquet files.</li>
</ul>
</td>
<td>3.0.0</td>
</tr>
<tr>
<td><code>spark.sql.parquet.int96RebaseModeInRead</code></td>
<td><code>EXCEPTION</code></td>
<td>The rebasing mode for the values of the <code>INT96</code> timestamp type from the Julian to Proleptic Gregorian calendar:<br />
<ul>
<li><code>EXCEPTION</code>: Spark will fail the reading if it sees ancient INT96 timestamps that are ambiguous between the two calendars.</li>
<li><code>CORRECTED</code>: Spark will not rebase and will read the INT96 timestamps as they are.</li>
<li><code>LEGACY</code>: Spark will rebase INT96 timestamps from the legacy hybrid (Julian + Gregorian) calendar to Proleptic Gregorian calendar when reading Parquet files.</li>
</ul>
This config is only effective if the writer info (like Spark, Hive) of the Parquet files is unknown.
</td>
<td>3.1.0</td>
</tr>
<tr>
<td><code>spark.sql.parquet.int96RebaseModeInWrite</code></td>
<td><code>EXCEPTION</code></td>
<td>The rebasing mode for the values of the <code>INT96</code> timestamp type from the Proleptic Gregorian to Julian calendar:<br />
<ul>
<li><code>EXCEPTION</code>: Spark will fail the writing if it sees ancient timestamps that are ambiguous between the two calendars.</li>
<li><code>CORRECTED</code>: Spark will not rebase and will write the INT96 timestamps as they are.</li>
<li><code>LEGACY</code>: Spark will rebase INT96 timestamps from Proleptic Gregorian calendar to the legacy hybrid (Julian + Gregorian) calendar when writing Parquet files.</li>
</ul>
</td>
<td>3.1.0</td>
</tr>
</table>
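<p>As a closing illustration, one of the configurations above could be applied before a write (a Scala sketch; <code class="language-plaintext highlighter-rouge">timestampsDF</code> is a placeholder DataFrame with a timestamp column):</p>
<figure class="highlight"><pre><code class="language-scala" data-lang="scala">// Write standard TIMESTAMP_MICROS instead of the default INT96
// (spark.sql.parquet.outputTimestampType, see the table above).
spark.conf.set("spark.sql.parquet.outputTimestampType", "TIMESTAMP_MICROS")
timestampsDF.write.parquet("/path/to/timestamps.parquet")</code></pre></figure>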
</div>
<!-- /container -->
</div>
<script src="js/vendor/jquery-3.5.1.min.js"></script>
<script src="js/vendor/bootstrap.bundle.min.js"></script>
<script src="js/vendor/anchor.min.js"></script>
<script src="js/main.js"></script>
<script type="text/javascript" src="js/vendor/docsearch.min.js"></script>
<script type="text/javascript">
// DocSearch is entirely free and automated. DocSearch is built in two parts:
// 1. a crawler which we run on our own infrastructure every 24 hours. It follows every link
// in your website and extract content from every page it traverses. It then pushes this
// content to an Algolia index.
// 2. a JavaScript snippet to be inserted in your website that will bind this Algolia index
// to your search input and display its results in a dropdown UI. If you want to find more
// details on how DocSearch works, check the DocSearch docs.
docsearch({
apiKey: 'd62f962a82bc9abb53471cb7b89da35e',
appId: 'RAI69RXRSK',
indexName: 'apache_spark',
inputSelector: '#docsearch-input',
enhancedSearchInput: true,
algoliaOptions: {
'facetFilters': ["version:3.5.0"]
},
debug: false // Set debug to true if you want to inspect the dropdown
});
</script>
<!-- MathJax Section -->
<script type="text/x-mathjax-config">
MathJax.Hub.Config({
TeX: { equationNumbers: { autoNumber: "AMS" } }
});
</script>
<script>
// Note that we load MathJax this way to work with local file (file://), HTTP and HTTPS.
// We could use "//cdn.mathjax...", but that won't support "file://".
(function(d, script) {
script = d.createElement('script');
script.type = 'text/javascript';
script.async = true;
script.onload = function(){
MathJax.Hub.Config({
tex2jax: {
inlineMath: [ ["$", "$"], ["\\\\(","\\\\)"] ],
displayMath: [ ["$$","$$"], ["\\[", "\\]"] ],
processEscapes: true,
skipTags: ['script', 'noscript', 'style', 'textarea', 'pre']
}
});
};
script.src = ('https:' == document.location.protocol ? 'https://' : 'http://') +
'cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.1/MathJax.js' +
'?config=TeX-AMS-MML_HTMLorMML';
d.getElementsByTagName('head')[0].appendChild(script);
}(document));
</script>
</body>
</html>