blob: 9f06bf1f897653af95b691e788a7867693dd9308 [file] [log] [blame]
<!DOCTYPE HTML>
<html lang="en-US">
<head>
<meta charset="UTF-8">
<title>Spark Configuration</title>
<meta name="viewport" content="width=device-width,initial-scale=1">
<meta name="generator" content="Jekyll v3.8.6">
<link rel="stylesheet" href="//fonts.googleapis.com/css?family=Lato:300,300italic,400,400italic,700,700italic,900">
<link rel="stylesheet" href="/css/screen.css">
<link rel="icon" type="image/x-icon" href="/favicon.ico">
<!--[if lt IE 9]>
<script src="/js/html5shiv.min.js"></script>
<script src="/js/respond.min.js"></script>
<![endif]-->
</head>
<body class="wrap">
<header role="banner">
<nav class="mobile-nav show-on-mobiles">
<ul>
<li class="">
<a href="/">Home</a>
</li>
<li class="">
<a href="/releases/"><span class="show-on-mobiles">Rel</span>
<span class="hide-on-mobiles">Releases</span></a>
</li>
<li class="current">
<a href="/docs/"><span class="show-on-mobiles">Doc</span>
<span class="hide-on-mobiles">Documentation</span></a>
</li>
<li class="">
<a href="/talks/"><span class="show-on-mobiles">Talk</span>
<span class="hide-on-mobiles">Talks</span></a>
</li>
<li class="">
<a href="/news/">News</a>
</li>
<li class="">
<a href="/develop/"><span class="show-on-mobiles">Dev</span>
<span class="hide-on-mobiles">Develop</span></a>
</li>
<li class="">
<a href="/help/">Help</a>
</li>
</ul>
</nav>
<div class="grid">
<div class="unit one-quarter center-on-mobiles">
<h1>
<a href="/">
<span class="sr-only">Apache ORC</span>
<img src="/img/logo.png" width="249" height="101" alt="ORC Logo">
</a>
</h1>
</div>
<nav class="main-nav unit three-quarters hide-on-mobiles">
<ul>
<li class="">
<a href="/">Home</a>
</li>
<li class="">
<a href="/releases/"><span class="show-on-mobiles">Rel</span>
<span class="hide-on-mobiles">Releases</span></a>
</li>
<li class="current">
<a href="/docs/"><span class="show-on-mobiles">Doc</span>
<span class="hide-on-mobiles">Documentation</span></a>
</li>
<li class="">
<a href="/talks/"><span class="show-on-mobiles">Talk</span>
<span class="hide-on-mobiles">Talks</span></a>
</li>
<li class="">
<a href="/news/">News</a>
</li>
<li class="">
<a href="/develop/"><span class="show-on-mobiles">Dev</span>
<span class="hide-on-mobiles">Develop</span></a>
</li>
<li class="">
<a href="/help/">Help</a>
</li>
</ul>
</nav>
</div>
</header>
<section class="docs">
<div class="grid">
<div class="docs-nav-mobile unit whole show-on-mobiles">
<select onchange="if (this.value) window.location.href=this.value">
<option value="">Navigate the docs…</option>
<optgroup label="Overview">
<option value="/docs/index.html">Background</option>
<option value="/docs/adopters.html">ORC Adopters</option>
<option value="/docs/types.html">Types</option>
<option value="/docs/indexes.html">Indexes</option>
<option value="/docs/acid.html">ACID support</option>
</optgroup>
<optgroup label="Installing">
<option value="/docs/building.html">Building ORC</option>
</optgroup>
<optgroup label="Using in Spark">
<option value="/docs/spark-ddl.html">Spark DDL</option>
<option value="/docs/spark-config.html">Spark Configuration</option>
</optgroup>
<optgroup label="Using in Python">
<option value="/docs/pyarrow.html">PyArrow</option>
<option value="/docs/dask.html">Dask</option>
</optgroup>
<optgroup label="Using in Hive">
<option value="/docs/hive-ddl.html">Hive DDL</option>
<option value="/docs/hive-config.html">Hive Configuration</option>
</optgroup>
<optgroup label="Using in MapReduce">
<option value="/docs/mapred.html">Using in MapRed</option>
<option value="/docs/mapreduce.html">Using in MapReduce</option>
</optgroup>
<optgroup label="Using ORC Core">
<option value="/docs/core-java.html">Using Core Java</option>
<option value="/docs/core-cpp.html">Using Core C++</option>
<option value="/docs/core-java-config.html">ORC Java configuration</option>
</optgroup>
<optgroup label="Tools">
<option value="/docs/cpp-tools.html">C++ Tools</option>
<option value="/docs/java-tools.html">Java Tools</option>
</optgroup>
</select>
</div>
<div class="unit four-fifths">
<article>
<h1>Spark Configuration</h1>
<h2 id="table-properties">Table properties</h2>
<p>Tables stored as ORC files use table properties to control their behavior. By
using table properties, the table owner ensures that all clients store data
with the same options.</p>
<table>
<thead>
<tr>
<th style="text-align: left">Key</th>
<th style="text-align: left">Default</th>
<th style="text-align: left">Notes</th>
</tr>
</thead>
<tbody>
<tr>
<td style="text-align: left">orc.compress</td>
<td style="text-align: left">ZSTD</td>
<td style="text-align: left">high level compression = {NONE, ZLIB, SNAPPY, LZO, LZ4, ZSTD}</td>
</tr>
<tr>
<td style="text-align: left">orc.compress.size</td>
<td style="text-align: left">262,144</td>
<td style="text-align: left">compression chunk size</td>
</tr>
<tr>
<td style="text-align: left">orc.stripe.size</td>
<td style="text-align: left">67,108,864</td>
<td style="text-align: left">memory buffer in bytes for writing</td>
</tr>
<tr>
<td style="text-align: left">orc.row.index.stride</td>
<td style="text-align: left">10,000</td>
<td style="text-align: left">number of rows between index entries</td>
</tr>
<tr>
<td style="text-align: left">orc.create.index</td>
<td style="text-align: left">true</td>
<td style="text-align: left">whether the ORC writer creates indexes as part of the file or not</td>
</tr>
<tr>
<td style="text-align: left">orc.bloom.filter.columns</td>
<td style="text-align: left">""</td>
<td style="text-align: left">comma separated list of column names</td>
</tr>
<tr>
<td style="text-align: left">orc.bloom.filter.fpp</td>
<td style="text-align: left">0.01</td>
<td style="text-align: left">bloom filter false positive rate</td>
</tr>
<tr>
<td style="text-align: left">orc.key.provider</td>
<td style="text-align: left">"hadoop"</td>
<td style="text-align: left">key provider</td>
</tr>
<tr>
<td style="text-align: left">orc.encrypt</td>
<td style="text-align: left">""</td>
<td style="text-align: left">list of keys and columns to encrypt with</td>
</tr>
<tr>
<td style="text-align: left">orc.mask</td>
<td style="text-align: left">""</td>
<td style="text-align: left">masks to apply to the encrypted columns</td>
</tr>
</tbody>
</table>
<p>For example, to create an ORC table with encrypted and masked columns:</p>
<div class="highlighter-rouge"><div class="highlight"><pre class="highlight"><code>CREATE TABLE encrypted (
ssn STRING,
email STRING,
name STRING
)
USING ORC
OPTIONS (
hadoop.security.key.provider.path "kms://http@localhost:9600/kms",
orc.key.provider "hadoop",
orc.encrypt "pii:ssn,email",
orc.mask "nullify:ssn;sha256:email"
)
</code></pre></div></div>
<h2 id="configuration-properties">Configuration properties</h2>
<p>There are more Spark configuration properties related to ORC files:</p>
<table>
<thead>
<tr>
<th style="text-align: left">Key</th>
<th style="text-align: left">Default</th>
<th style="text-align: left">Notes</th>
</tr>
</thead>
<tbody>
<tr>
<td style="text-align: left">spark.sql.orc.impl</td>
<td style="text-align: left">native</td>
<td style="text-align: left">The name of ORC implementation. It can be one of <code class="highlighter-rouge">native</code> or <code class="highlighter-rouge">hive</code>. <code class="highlighter-rouge">native</code> means the native ORC support. <code class="highlighter-rouge">hive</code> means the ORC library in Hive.</td>
</tr>
<tr>
<td style="text-align: left">spark.sql.orc.enableVectorizedReader</td>
<td style="text-align: left">true</td>
<td style="text-align: left">Enables vectorized ORC decoding in the <code class="highlighter-rouge">native</code> implementation.</td>
</tr>
<tr>
<td style="text-align: left">spark.sql.orc.mergeSchema</td>
<td style="text-align: left">false</td>
<td style="text-align: left">When true, the ORC data source merges schemas collected from all data files, otherwise the schema is picked from a random data file.</td>
</tr>
<tr>
<td style="text-align: left">spark.sql.hive.convertMetastoreOrc</td>
<td style="text-align: left">true</td>
<td style="text-align: left">When set to false, Spark SQL will use the Hive SerDe for ORC tables instead of the built-in support.</td>
</tr>
</tbody>
</table>
<div class="section-nav">
<div class="left align-right">
<a href="/docs/spark-ddl.html" class="prev">Back</a>
</div>
<div class="right align-left">
<a href="/docs/pyarrow.html" class="next">Next</a>
</div>
</div>
<div class="clear"></div>
</article>
</div>
<div class="unit one-fifth hide-on-mobiles">
<aside>
<h4>Overview</h4>
<ul>
<li class=""><a href="/docs/index.html">Background</a></li>
<li class=""><a href="/docs/adopters.html">ORC Adopters</a></li>
<li class=""><a href="/docs/types.html">Types</a></li>
<li class=""><a href="/docs/indexes.html">Indexes</a></li>
<li class=""><a href="/docs/acid.html">ACID support</a></li>
</ul>
<h4>Installing</h4>
<ul>
<li class=""><a href="/docs/building.html">Building ORC</a></li>
</ul>
<h4>Using in Spark</h4>
<ul>
<li class=""><a href="/docs/spark-ddl.html">Spark DDL</a></li>
<li class="current"><a href="/docs/spark-config.html">Spark Configuration</a></li>
</ul>
<h4>Using in Python</h4>
<ul>
<li class=""><a href="/docs/pyarrow.html">PyArrow</a></li>
<li class=""><a href="/docs/dask.html">Dask</a></li>
</ul>
<h4>Using in Hive</h4>
<ul>
<li class=""><a href="/docs/hive-ddl.html">Hive DDL</a></li>
<li class=""><a href="/docs/hive-config.html">Hive Configuration</a></li>
</ul>
<h4>Using in MapReduce</h4>
<ul>
<li class=""><a href="/docs/mapred.html">Using in MapRed</a></li>
<li class=""><a href="/docs/mapreduce.html">Using in MapReduce</a></li>
</ul>
<h4>Using ORC Core</h4>
<ul>
<li class=""><a href="/docs/core-java.html">Using Core Java</a></li>
<li class=""><a href="/docs/core-cpp.html">Using Core C++</a></li>
<li class=""><a href="/docs/core-java-config.html">ORC Java configuration</a></li>
</ul>
<h4>Tools</h4>
<ul>
<li class=""><a href="/docs/cpp-tools.html">C++ Tools</a></li>
<li class=""><a href="/docs/java-tools.html">Java Tools</a></li>
</ul>
</aside>
</div>
<div class="clear"></div>
</div>
</section>
<footer role="contentinfo">
<p style="margin-left: 20px; margin-right: 20px; text-align: center">The contents of this website are &copy;&nbsp;2024
<a href="https://www.apache.org/">Apache Software Foundation</a>
under the terms of the <a
href="https://www.apache.org/licenses/LICENSE-2.0.html">
Apache&nbsp;License&nbsp;v2</a>. Apache ORC and its logo are trademarks
of the Apache Software Foundation.</p>
</footer>
<script>
/**
 * Build the permalink anchor that is attached to a heading.
 *
 * @param {string} id - value placed after "#" in the link's href.
 * @returns {HTMLAnchorElement} a detached <a class="header-link"> element
 *     whose content is a "sr-only" text span followed by an <i> element
 *     carrying the "fa fa-link" classes.
 */
var anchorForId = function (id) {
  var link = document.createElement("a");
  // Tooltip text and inner markup are fixed; only the href depends on id.
  link.title = "Permalink";
  link.innerHTML = "<span class=\"sr-only\">Permalink</span><i class=\"fa fa-link\"></i>";
  link.href = "#" + id;
  link.className = "header-link";
  return link;
};
/**
 * Append a permalink anchor to every heading of one level inside a container.
 *
 * @param {number} level - heading level; "h" + level is the tag name searched for.
 * @param {Element} containingElement - element whose descendants are searched.
 */
var linkifyAnchors = function (level, containingElement) {
  var headings = containingElement.getElementsByTagName("h" + level);
  Array.prototype.forEach.call(headings, function (heading) {
    var headingId = heading.id;
    // Headings without an id have nothing to link to, so they are left alone.
    if (headingId === undefined || headingId === "") {
      return;
    }
    heading.appendChild(anchorForId(headingId));
  });
};
// Once the document reaches the "complete" ready state, run linkifyAnchors
// for heading levels 1 through 6 over the first element with class "docs",
// or the first element with class "news" when no "docs" element exists.
// The listener is registered with addEventListener instead of assigning
// document.onreadystatechange, so a handler set elsewhere through that
// property is not replaced (and this one cannot be replaced by a later
// assignment).
document.addEventListener("readystatechange", function () {
  // The event also fires for earlier states (e.g. "interactive"); wait
  // until loading has fully finished.
  if (document.readyState !== "complete") {
    return;
  }
  var contentBlock = document.getElementsByClassName("docs")[0] || document.getElementsByClassName("news")[0];
  if (!contentBlock) {
    // Neither kind of content block is present: nothing to linkify.
    return;
  }
  for (var level = 1; level <= 6; level++) {
    linkifyAnchors(level, contentBlock);
  }
});
</script>
</body>
</html>