blob: 098196c7f6cb27813092cba1d8e29275c79d0ea4 [file] [log] [blame]
<!DOCTYPE HTML>
<html lang="en-US">
<head>
<meta charset="UTF-8">
<title>Using Core Java</title>
<meta name="viewport" content="width=device-width,initial-scale=1">
<meta name="generator" content="Jekyll v3.8.6">
<link rel="stylesheet" href="//fonts.googleapis.com/css?family=Lato:300,300italic,400,400italic,700,700italic,900">
<link rel="stylesheet" href="/css/screen.css">
<link rel="icon" type="image/x-icon" href="/favicon.ico">
<!--[if lt IE 9]>
<script src="/js/html5shiv.min.js"></script>
<script src="/js/respond.min.js"></script>
<![endif]-->
</head>
<body class="wrap">
<header role="banner">
<nav class="mobile-nav show-on-mobiles">
<ul>
<li class="">
<a href="/">Home</a>
</li>
<li class="">
<a href="/releases/"><span class="show-on-mobiles">Rel</span>
<span class="hide-on-mobiles">Releases</span></a>
</li>
<li class="current">
<a href="/docs/"><span class="show-on-mobiles">Doc</span>
<span class="hide-on-mobiles">Documentation</span></a>
</li>
<li class="">
<a href="/talks/"><span class="show-on-mobiles">Talk</span>
<span class="hide-on-mobiles">Talks</span></a>
</li>
<li class="">
<a href="/news/">News</a>
</li>
<li class="">
<a href="/develop/"><span class="show-on-mobiles">Dev</span>
<span class="hide-on-mobiles">Develop</span></a>
</li>
<li class="">
<a href="/help/">Help</a>
</li>
</ul>
</nav>
<div class="grid">
<div class="unit one-quarter center-on-mobiles">
<h1>
<a href="/">
<span class="sr-only">Apache ORC</span>
<img src="/img/logo.png" width="249" height="101" alt="ORC Logo">
</a>
</h1>
</div>
<nav class="main-nav unit three-quarters hide-on-mobiles">
<ul>
<li class="">
<a href="/">Home</a>
</li>
<li class="">
<a href="/releases/"><span class="show-on-mobiles">Rel</span>
<span class="hide-on-mobiles">Releases</span></a>
</li>
<li class="current">
<a href="/docs/"><span class="show-on-mobiles">Doc</span>
<span class="hide-on-mobiles">Documentation</span></a>
</li>
<li class="">
<a href="/talks/"><span class="show-on-mobiles">Talk</span>
<span class="hide-on-mobiles">Talks</span></a>
</li>
<li class="">
<a href="/news/">News</a>
</li>
<li class="">
<a href="/develop/"><span class="show-on-mobiles">Dev</span>
<span class="hide-on-mobiles">Develop</span></a>
</li>
<li class="">
<a href="/help/">Help</a>
</li>
</ul>
</nav>
</div>
</header>
<section class="docs">
<div class="grid">
<div class="docs-nav-mobile unit whole show-on-mobiles">
<select onchange="if (this.value) window.location.href=this.value">
<option value="">Navigate the docs…</option>
<optgroup label="Overview">
<option value="/docs/index.html">Background</option>
<option value="/docs/adopters.html">ORC Adopters</option>
<option value="/docs/types.html">Types</option>
<option value="/docs/indexes.html">Indexes</option>
<option value="/docs/acid.html">ACID support</option>
</optgroup>
<optgroup label="Installing">
<option value="/docs/building.html">Building ORC</option>
</optgroup>
<optgroup label="Using in Spark">
<option value="/docs/spark-ddl.html">Spark DDL</option>
<option value="/docs/spark-config.html">Spark Configuration</option>
</optgroup>
<optgroup label="Using in Python">
<option value="/docs/pyarrow.html">PyArrow</option>
<option value="/docs/dask.html">Dask</option>
</optgroup>
<optgroup label="Using in Hive">
<option value="/docs/hive-ddl.html">Hive DDL</option>
<option value="/docs/hive-config.html">Hive Configuration</option>
</optgroup>
<optgroup label="Using in MapReduce">
<option value="/docs/mapred.html">Using in MapRed</option>
<option value="/docs/mapreduce.html">Using in MapReduce</option>
</optgroup>
<optgroup label="Using ORC Core">
<option value="/docs/core-java.html">Using Core Java</option>
<option value="/docs/core-cpp.html">Using Core C++</option>
<option value="/docs/core-java-config.html">ORC Java configuration</option>
</optgroup>
<optgroup label="Tools">
<option value="/docs/cpp-tools.html">C++ Tools</option>
<option value="/docs/java-tools.html">Java Tools</option>
</optgroup>
</select>
</div>
<div class="unit four-fifths">
<article>
<h1>Using Core Java</h1>
<p>The Core ORC API reads and writes ORC files into Hive’s storage-api
vectorized classes. Both Hive and MapReduce use the Core API to actually
read and write the data.</p>
<h2 id="vectorized-row-batch">Vectorized Row Batch</h2>
<p>Data is passed to ORC as instances of
<a href="https://javadoc.io/static/org.apache.hive/hive-storage-api/2.8.1/org/apache/hadoop/hive/ql/exec/vector/VectorizedRowBatch.html">VectorizedRowBatch</a>
that contain the data for 1024 rows. The focus is on speed and
accessing the data fields directly. <code class="highlighter-rouge">cols</code> is an array of
<a href="https://javadoc.io/static/org.apache.hive/hive-storage-api/2.8.1/org/apache/hadoop/hive/ql/exec/vector/ColumnVector.html">ColumnVector</a>
and <code class="highlighter-rouge">size</code> is the number of rows.</p>
<div class="language-java highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="kn">package</span> <span class="nn">org.apache.hadoop.hive.ql.exec.vector</span><span class="o">;</span>
<span class="kd">public</span> <span class="kd">class</span> <span class="nc">VectorizedRowBatch</span> <span class="o">{</span>
<span class="kd">public</span> <span class="nc">ColumnVector</span><span class="o">[]</span> <span class="n">cols</span><span class="o">;</span>
<span class="kd">public</span> <span class="kt">int</span> <span class="n">size</span><span class="o">;</span>
<span class="o">...</span>
<span class="o">}</span>
</code></pre></div></div>
<p><a href="https://javadoc.io/static/org.apache.hive/hive-storage-api/2.8.1/org/apache/hadoop/hive/ql/exec/vector/ColumnVector.html">ColumnVector</a>
is the parent type of the different kinds of columns and has some
fields that are shared across all of the column types. In particular,
the <code class="highlighter-rouge">noNulls</code> flag is set if there are no nulls in this column for this batch
and the <code class="highlighter-rouge">isRepeating</code> flag is set for columns where the entire batch is the
same value. For columns where <code class="highlighter-rouge">noNulls == false</code>, the <code class="highlighter-rouge">isNull</code> array entry is
true if that value is null.</p>
<div class="language-java highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="kd">public</span> <span class="kd">abstract</span> <span class="kd">class</span> <span class="nc">ColumnVector</span> <span class="o">{</span>
<span class="c1">// If the whole column vector has no nulls, this is true, otherwise false.</span>
<span class="kd">public</span> <span class="kt">boolean</span> <span class="n">noNulls</span><span class="o">;</span>
<span class="c1">// If noNulls is false, then this array contains true if the value</span>
<span class="c1">// is null, otherwise false.</span>
<span class="kd">public</span> <span class="kt">boolean</span><span class="o">[]</span> <span class="n">isNull</span><span class="o">;</span>
<span class="cm">/*
* True if same value repeats for whole column vector.
* If so, vector[0] holds the repeating value.
*/</span>
<span class="kd">public</span> <span class="kt">boolean</span> <span class="n">isRepeating</span><span class="o">;</span>
<span class="o">...</span>
<span class="o">}</span>
</code></pre></div></div>
<p>The subtypes of ColumnVector are:</p>
<table>
<thead>
<tr>
<th>ORC Type</th>
<th>ColumnVector</th>
</tr>
</thead>
<tbody>
<tr>
<td>array</td>
<td><a href="https://javadoc.io/static/org.apache.hive/hive-storage-api/2.8.1/org/apache/hadoop/hive/ql/exec/vector/ListColumnVector.html">ListColumnVector</a></td>
</tr>
<tr>
<td>binary</td>
<td><a href="https://javadoc.io/static/org.apache.hive/hive-storage-api/2.8.1/org/apache/hadoop/hive/ql/exec/vector/BytesColumnVector.html">BytesColumnVector</a></td>
</tr>
<tr>
<td>bigint</td>
<td><a href="https://javadoc.io/static/org.apache.hive/hive-storage-api/2.8.1/org/apache/hadoop/hive/ql/exec/vector/LongColumnVector.html">LongColumnVector</a></td>
</tr>
<tr>
<td>boolean</td>
<td><a href="https://javadoc.io/static/org.apache.hive/hive-storage-api/2.8.1/org/apache/hadoop/hive/ql/exec/vector/LongColumnVector.html">LongColumnVector</a></td>
</tr>
<tr>
<td>char</td>
<td><a href="https://javadoc.io/static/org.apache.hive/hive-storage-api/2.8.1/org/apache/hadoop/hive/ql/exec/vector/BytesColumnVector.html">BytesColumnVector</a></td>
</tr>
<tr>
<td>date</td>
<td><a href="https://javadoc.io/static/org.apache.hive/hive-storage-api/2.8.1/org/apache/hadoop/hive/ql/exec/vector/LongColumnVector.html">LongColumnVector</a></td>
</tr>
<tr>
<td>decimal</td>
<td><a href="https://javadoc.io/static/org.apache.hive/hive-storage-api/2.8.1/org/apache/hadoop/hive/ql/exec/vector/DecimalColumnVector.html">DecimalColumnVector</a></td>
</tr>
<tr>
<td>double</td>
<td><a href="https://javadoc.io/static/org.apache.hive/hive-storage-api/2.8.1/org/apache/hadoop/hive/ql/exec/vector/DoubleColumnVector.html">DoubleColumnVector</a></td>
</tr>
<tr>
<td>float</td>
<td><a href="https://javadoc.io/static/org.apache.hive/hive-storage-api/2.8.1/org/apache/hadoop/hive/ql/exec/vector/DoubleColumnVector.html">DoubleColumnVector</a></td>
</tr>
<tr>
<td>int</td>
<td><a href="https://javadoc.io/static/org.apache.hive/hive-storage-api/2.8.1/org/apache/hadoop/hive/ql/exec/vector/LongColumnVector.html">LongColumnVector</a></td>
</tr>
<tr>
<td>map</td>
<td><a href="https://javadoc.io/static/org.apache.hive/hive-storage-api/2.8.1/org/apache/hadoop/hive/ql/exec/vector/MapColumnVector.html">MapColumnVector</a></td>
</tr>
<tr>
<td>smallint</td>
<td><a href="https://javadoc.io/static/org.apache.hive/hive-storage-api/2.8.1/org/apache/hadoop/hive/ql/exec/vector/LongColumnVector.html">LongColumnVector</a></td>
</tr>
<tr>
<td>string</td>
<td><a href="https://javadoc.io/static/org.apache.hive/hive-storage-api/2.8.1/org/apache/hadoop/hive/ql/exec/vector/BytesColumnVector.html">BytesColumnVector</a></td>
</tr>
<tr>
<td>struct</td>
<td><a href="https://javadoc.io/static/org.apache.hive/hive-storage-api/2.8.1/org/apache/hadoop/hive/ql/exec/vector/StructColumnVector.html">StructColumnVector</a></td>
</tr>
<tr>
<td>timestamp</td>
<td><a href="https://javadoc.io/static/org.apache.hive/hive-storage-api/2.8.1/org/apache/hadoop/hive/ql/exec/vector/TimestampColumnVector.html">TimestampColumnVector</a></td>
</tr>
<tr>
<td>tinyint</td>
<td><a href="https://javadoc.io/static/org.apache.hive/hive-storage-api/2.8.1/org/apache/hadoop/hive/ql/exec/vector/LongColumnVector.html">LongColumnVector</a></td>
</tr>
<tr>
<td>uniontype</td>
<td><a href="https://javadoc.io/static/org.apache.hive/hive-storage-api/2.8.1/org/apache/hadoop/hive/ql/exec/vector/UnionColumnVector.html">UnionColumnVector</a></td>
</tr>
<tr>
<td>varchar</td>
<td><a href="https://javadoc.io/static/org.apache.hive/hive-storage-api/2.8.1/org/apache/hadoop/hive/ql/exec/vector/BytesColumnVector.html">BytesColumnVector</a></td>
</tr>
</tbody>
</table>
<p><a href="https://javadoc.io/static/org.apache.hive/hive-storage-api/2.8.1/org/apache/hadoop/hive/ql/exec/vector/LongColumnVector.html">LongColumnVector</a> handles all of the integer types (boolean, bigint,
date, int, smallint, and tinyint). The data is represented as an array of
longs where each value is sign-extended as necessary.</p>
<div class="language-java highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="kd">public</span> <span class="kd">class</span> <span class="nc">LongColumnVector</span> <span class="kd">extends</span> <span class="nc">ColumnVector</span> <span class="o">{</span>
<span class="kd">public</span> <span class="kt">long</span><span class="o">[]</span> <span class="n">vector</span><span class="o">;</span>
<span class="o">...</span>
<span class="o">}</span>
</code></pre></div></div>
<p><a href="https://javadoc.io/static/org.apache.hive/hive-storage-api/2.8.1/org/apache/hadoop/hive/ql/exec/vector/TimestampColumnVector.html">TimestampColumnVector</a>
handles timestamp values. The data is represented as an array of longs
and an array of ints.</p>
<div class="language-java highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="kd">public</span> <span class="kd">class</span> <span class="nc">TimestampColumnVector</span> <span class="kd">extends</span> <span class="nc">ColumnVector</span> <span class="o">{</span>
<span class="c1">// the number of milliseconds since 1 Jan 1970 00:00 GMT</span>
<span class="kd">public</span> <span class="kt">long</span><span class="o">[]</span> <span class="n">time</span><span class="o">;</span>
<span class="c1">// the number of nanoseconds within the second</span>
<span class="kd">public</span> <span class="kt">int</span><span class="o">[]</span> <span class="n">nanos</span><span class="o">;</span>
<span class="o">...</span>
<span class="o">}</span>
</code></pre></div></div>
<p><a href="https://javadoc.io/static/org.apache.hive/hive-storage-api/2.8.1/org/apache/hadoop/hive/ql/exec/vector/DoubleColumnVector.html">DoubleColumnVector</a>
handles all of the floating point types (double and float). The data
is represented as an array of doubles.</p>
<div class="language-java highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="kd">public</span> <span class="kd">class</span> <span class="nc">DoubleColumnVector</span> <span class="kd">extends</span> <span class="nc">ColumnVector</span> <span class="o">{</span>
<span class="kd">public</span> <span class="kt">double</span><span class="o">[]</span> <span class="n">vector</span><span class="o">;</span>
<span class="o">...</span>
<span class="o">}</span>
</code></pre></div></div>
<p><a href="https://javadoc.io/static/org.apache.hive/hive-storage-api/2.8.1/org/apache/hadoop/hive/ql/exec/vector/DecimalColumnVector.html">DecimalColumnVector</a>
handles decimal columns. The data is represented as an array of
HiveDecimalWritable. Note that this implementation is not performant
and will likely be replaced.</p>
<div class="language-java highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="kd">public</span> <span class="kd">class</span> <span class="nc">DecimalColumnVector</span> <span class="kd">extends</span> <span class="nc">ColumnVector</span> <span class="o">{</span>
<span class="kd">public</span> <span class="nc">HiveDecimalWritable</span><span class="o">[]</span> <span class="n">vector</span><span class="o">;</span>
<span class="o">...</span>
<span class="o">}</span>
</code></pre></div></div>
<p><a href="https://javadoc.io/static/org.apache.hive/hive-storage-api/2.8.1/org/apache/hadoop/hive/ql/exec/vector/BytesColumnVector.html">BytesColumnVector</a>
handles all of the binary types (binary, char, string, and
varchar). The data is represented as a byte array, offset, and
length. The byte arrays may or may not be shared between values.</p>
<div class="language-java highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="kd">public</span> <span class="kd">class</span> <span class="nc">BytesColumnVector</span> <span class="kd">extends</span> <span class="nc">ColumnVector</span> <span class="o">{</span>
<span class="kd">public</span> <span class="kt">byte</span><span class="o">[][]</span> <span class="n">vector</span><span class="o">;</span>
<span class="kd">public</span> <span class="kt">int</span><span class="o">[]</span> <span class="n">start</span><span class="o">;</span>
<span class="kd">public</span> <span class="kt">int</span><span class="o">[]</span> <span class="n">length</span><span class="o">;</span>
<span class="o">...</span>
<span class="o">}</span>
</code></pre></div></div>
<p><a href="https://javadoc.io/static/org.apache.hive/hive-storage-api/2.8.1/org/apache/hadoop/hive/ql/exec/vector/StructColumnVector.html">StructColumnVector</a>
handles the struct columns and represents the data as an array of
<code class="highlighter-rouge">ColumnVector</code>. The value for row 5 consists of the fifth value from
each of the <code class="highlighter-rouge">fields</code> values.</p>
<div class="language-java highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="kd">public</span> <span class="kd">class</span> <span class="nc">StructColumnVector</span> <span class="kd">extends</span> <span class="nc">ColumnVector</span> <span class="o">{</span>
<span class="kd">public</span> <span class="nc">ColumnVector</span><span class="o">[]</span> <span class="n">fields</span><span class="o">;</span>
<span class="o">...</span>
<span class="o">}</span>
</code></pre></div></div>
<p><a href="https://javadoc.io/static/org.apache.hive/hive-storage-api/2.8.1/org/apache/hadoop/hive/ql/exec/vector/UnionColumnVector.html">UnionColumnVector</a>
handles the union columns and represents the data as an array of
integers that pick the subtype and a <code class="highlighter-rouge">fields</code> array with one entry per
subtype. Only the value of the <code class="highlighter-rouge">fields</code> that corresponds to
<code class="highlighter-rouge">tags[row]</code> is set.</p>
<div class="language-java highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="kd">public</span> <span class="kd">class</span> <span class="nc">UnionColumnVector</span> <span class="kd">extends</span> <span class="nc">ColumnVector</span> <span class="o">{</span>
<span class="kd">public</span> <span class="kt">int</span><span class="o">[]</span> <span class="n">tags</span><span class="o">;</span>
<span class="kd">public</span> <span class="nc">ColumnVector</span><span class="o">[]</span> <span class="n">fields</span><span class="o">;</span>
<span class="o">...</span>
<span class="o">}</span>
</code></pre></div></div>
<p><a href="https://javadoc.io/static/org.apache.hive/hive-storage-api/2.8.1/org/apache/hadoop/hive/ql/exec/vector/ListColumnVector.html">ListColumnVector</a>
handles the array columns and represents the data as two arrays of
integers for the offset and lengths and a <code class="highlighter-rouge">ColumnVector</code> for the
children values.</p>
<div class="language-java highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="kd">public</span> <span class="kd">class</span> <span class="nc">ListColumnVector</span> <span class="kd">extends</span> <span class="nc">ColumnVector</span> <span class="o">{</span>
<span class="c1">// for each row, the first offset of the child</span>
<span class="kd">public</span> <span class="kt">long</span><span class="o">[]</span> <span class="n">offsets</span><span class="o">;</span>
<span class="c1">// for each row, the number of elements in the array</span>
<span class="kd">public</span> <span class="kt">long</span><span class="o">[]</span> <span class="n">lengths</span><span class="o">;</span>
<span class="c1">// the offset in the child that should be used for new values</span>
<span class="kd">public</span> <span class="kt">int</span> <span class="n">childCount</span><span class="o">;</span>
<span class="c1">// the values of the children</span>
<span class="kd">public</span> <span class="nc">ColumnVector</span> <span class="n">child</span><span class="o">;</span>
<span class="o">...</span>
<span class="o">}</span>
</code></pre></div></div>
<p><a href="https://javadoc.io/static/org.apache.hive/hive-storage-api/2.8.1/org/apache/hadoop/hive/ql/exec/vector/MapColumnVector.html">MapColumnVector</a>
handles the map columns and represents the data as two arrays of
integers for the offset and lengths and two <code class="highlighter-rouge">ColumnVector</code>s for the
keys and values.</p>
<div class="language-java highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="kd">public</span> <span class="kd">class</span> <span class="nc">MapColumnVector</span> <span class="kd">extends</span> <span class="nc">ColumnVector</span> <span class="o">{</span>
<span class="c1">// for each row, the first offset of the child</span>
<span class="kd">public</span> <span class="kt">long</span><span class="o">[]</span> <span class="n">offsets</span><span class="o">;</span>
<span class="c1">// for each row, the number of elements in the map</span>
<span class="kd">public</span> <span class="kt">long</span><span class="o">[]</span> <span class="n">lengths</span><span class="o">;</span>
<span class="c1">// the offset in the child that should be used for new values</span>
<span class="kd">public</span> <span class="kt">int</span> <span class="n">childCount</span><span class="o">;</span>
<span class="c1">// the values of the keys and values</span>
<span class="kd">public</span> <span class="nc">ColumnVector</span> <span class="n">keys</span><span class="o">;</span>
<span class="kd">public</span> <span class="nc">ColumnVector</span> <span class="n">values</span><span class="o">;</span>
<span class="o">...</span>
<span class="o">}</span>
</code></pre></div></div>
<h2 id="writing-orc-files">Writing ORC Files</h2>
<h3 id="simple-example">Simple Example</h3>
<p>To write an ORC file, you need to define the schema and use the
<a href="/api/orc-core/index.html?org/apache/orc/OrcFile.html">OrcFile</a>
class to create a
<a href="/api/orc-core/index.html?org/apache/orc/Writer.html">Writer</a>
with the desired filename. This example sets the required schema
parameter, but there are many other options to control the ORC writer.</p>
<div class="language-java highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="nc">Configuration</span> <span class="n">conf</span> <span class="o">=</span> <span class="k">new</span> <span class="nc">Configuration</span><span class="o">();</span>
<span class="nc">TypeDescription</span> <span class="n">schema</span> <span class="o">=</span> <span class="nc">TypeDescription</span><span class="o">.</span><span class="na">fromString</span><span class="o">(</span><span class="s">"struct&lt;x:int,y:int&gt;"</span><span class="o">);</span>
<span class="nc">Writer</span> <span class="n">writer</span> <span class="o">=</span> <span class="nc">OrcFile</span><span class="o">.</span><span class="na">createWriter</span><span class="o">(</span><span class="k">new</span> <span class="nc">Path</span><span class="o">(</span><span class="s">"my-file.orc"</span><span class="o">),</span>
<span class="nc">OrcFile</span><span class="o">.</span><span class="na">writerOptions</span><span class="o">(</span><span class="n">conf</span><span class="o">)</span>
<span class="o">.</span><span class="na">setSchema</span><span class="o">(</span><span class="n">schema</span><span class="o">));</span>
</code></pre></div></div>
<p>Now you need to create a row batch, set the data, and write it to the file
as the batch fills up. When the file is done, close the <code class="highlighter-rouge">Writer</code>.</p>
<div class="language-java highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="nc">VectorizedRowBatch</span> <span class="n">batch</span> <span class="o">=</span> <span class="n">schema</span><span class="o">.</span><span class="na">createRowBatch</span><span class="o">();</span>
<span class="nc">LongColumnVector</span> <span class="n">x</span> <span class="o">=</span> <span class="o">(</span><span class="nc">LongColumnVector</span><span class="o">)</span> <span class="n">batch</span><span class="o">.</span><span class="na">cols</span><span class="o">[</span><span class="mi">0</span><span class="o">];</span>
<span class="nc">LongColumnVector</span> <span class="n">y</span> <span class="o">=</span> <span class="o">(</span><span class="nc">LongColumnVector</span><span class="o">)</span> <span class="n">batch</span><span class="o">.</span><span class="na">cols</span><span class="o">[</span><span class="mi">1</span><span class="o">];</span>
<span class="k">for</span><span class="o">(</span><span class="kt">int</span> <span class="n">r</span><span class="o">=</span><span class="mi">0</span><span class="o">;</span> <span class="n">r</span> <span class="o">&lt;</span> <span class="mi">10000</span><span class="o">;</span> <span class="o">++</span><span class="n">r</span><span class="o">)</span> <span class="o">{</span>
<span class="kt">int</span> <span class="n">row</span> <span class="o">=</span> <span class="n">batch</span><span class="o">.</span><span class="na">size</span><span class="o">++;</span>
<span class="n">x</span><span class="o">.</span><span class="na">vector</span><span class="o">[</span><span class="n">row</span><span class="o">]</span> <span class="o">=</span> <span class="n">r</span><span class="o">;</span>
<span class="n">y</span><span class="o">.</span><span class="na">vector</span><span class="o">[</span><span class="n">row</span><span class="o">]</span> <span class="o">=</span> <span class="n">r</span> <span class="o">*</span> <span class="mi">3</span><span class="o">;</span>
<span class="c1">// If the batch is full, write it out and start over.</span>
<span class="k">if</span> <span class="o">(</span><span class="n">batch</span><span class="o">.</span><span class="na">size</span> <span class="o">==</span> <span class="n">batch</span><span class="o">.</span><span class="na">getMaxSize</span><span class="o">())</span> <span class="o">{</span>
<span class="n">writer</span><span class="o">.</span><span class="na">addRowBatch</span><span class="o">(</span><span class="n">batch</span><span class="o">);</span>
<span class="n">batch</span><span class="o">.</span><span class="na">reset</span><span class="o">();</span>
<span class="o">}</span>
<span class="o">}</span>
<span class="k">if</span> <span class="o">(</span><span class="n">batch</span><span class="o">.</span><span class="na">size</span> <span class="o">!=</span> <span class="mi">0</span><span class="o">)</span> <span class="o">{</span>
<span class="n">writer</span><span class="o">.</span><span class="na">addRowBatch</span><span class="o">(</span><span class="n">batch</span><span class="o">);</span>
<span class="n">batch</span><span class="o">.</span><span class="na">reset</span><span class="o">();</span>
<span class="o">}</span>
<span class="n">writer</span><span class="o">.</span><span class="na">close</span><span class="o">();</span>
</code></pre></div></div>
<h3 id="advanced-example">Advanced Example</h3>
<p>The following example writes an ORC file with two integer
columns and a map column. Each row’s map has 5 elements with keys
ranging from “row &lt;n&gt;.0” to “row &lt;n&gt;.4”, where &lt;n&gt; is the row number.</p>
<div class="language-java highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="nc">Path</span> <span class="n">testFilePath</span> <span class="o">=</span> <span class="k">new</span> <span class="nc">Path</span><span class="o">(</span><span class="s">"advanced-example.orc"</span><span class="o">);</span>
<span class="nc">Configuration</span> <span class="n">conf</span> <span class="o">=</span> <span class="k">new</span> <span class="nc">Configuration</span><span class="o">();</span>
<span class="nc">TypeDescription</span> <span class="n">schema</span> <span class="o">=</span>
<span class="nc">TypeDescription</span><span class="o">.</span><span class="na">fromString</span><span class="o">(</span><span class="s">"struct&lt;first:int,"</span> <span class="o">+</span>
<span class="s">"second:int,third:map&lt;string,int&gt;&gt;"</span><span class="o">);</span>
<span class="nc">Writer</span> <span class="n">writer</span> <span class="o">=</span>
<span class="nc">OrcFile</span><span class="o">.</span><span class="na">createWriter</span><span class="o">(</span><span class="n">testFilePath</span><span class="o">,</span>
<span class="nc">OrcFile</span><span class="o">.</span><span class="na">writerOptions</span><span class="o">(</span><span class="n">conf</span><span class="o">).</span><span class="na">setSchema</span><span class="o">(</span><span class="n">schema</span><span class="o">));</span>
<span class="nc">VectorizedRowBatch</span> <span class="n">batch</span> <span class="o">=</span> <span class="n">schema</span><span class="o">.</span><span class="na">createRowBatch</span><span class="o">();</span>
<span class="nc">LongColumnVector</span> <span class="n">first</span> <span class="o">=</span> <span class="o">(</span><span class="nc">LongColumnVector</span><span class="o">)</span> <span class="n">batch</span><span class="o">.</span><span class="na">cols</span><span class="o">[</span><span class="mi">0</span><span class="o">];</span>
<span class="nc">LongColumnVector</span> <span class="n">second</span> <span class="o">=</span> <span class="o">(</span><span class="nc">LongColumnVector</span><span class="o">)</span> <span class="n">batch</span><span class="o">.</span><span class="na">cols</span><span class="o">[</span><span class="mi">1</span><span class="o">];</span>
<span class="c1">// Define the map. You also need to cast the key and value vectors</span>
<span class="nc">MapColumnVector</span> <span class="n">map</span> <span class="o">=</span> <span class="o">(</span><span class="nc">MapColumnVector</span><span class="o">)</span> <span class="n">batch</span><span class="o">.</span><span class="na">cols</span><span class="o">[</span><span class="mi">2</span><span class="o">];</span>
<span class="nc">BytesColumnVector</span> <span class="n">mapKey</span> <span class="o">=</span> <span class="o">(</span><span class="nc">BytesColumnVector</span><span class="o">)</span> <span class="n">map</span><span class="o">.</span><span class="na">keys</span><span class="o">;</span>
<span class="nc">LongColumnVector</span> <span class="n">mapValue</span> <span class="o">=</span> <span class="o">(</span><span class="nc">LongColumnVector</span><span class="o">)</span> <span class="n">map</span><span class="o">.</span><span class="na">values</span><span class="o">;</span>
<span class="c1">// Each map has 5 elements</span>
<span class="kd">final</span> <span class="kt">int</span> <span class="no">MAP_SIZE</span> <span class="o">=</span> <span class="mi">5</span><span class="o">;</span>
<span class="kd">final</span> <span class="kt">int</span> <span class="no">BATCH_SIZE</span> <span class="o">=</span> <span class="n">batch</span><span class="o">.</span><span class="na">getMaxSize</span><span class="o">();</span>
<span class="c1">// Ensure the map is big enough</span>
<span class="n">mapKey</span><span class="o">.</span><span class="na">ensureSize</span><span class="o">(</span><span class="no">BATCH_SIZE</span> <span class="o">*</span> <span class="no">MAP_SIZE</span><span class="o">,</span> <span class="kc">false</span><span class="o">);</span>
<span class="n">mapValue</span><span class="o">.</span><span class="na">ensureSize</span><span class="o">(</span><span class="no">BATCH_SIZE</span> <span class="o">*</span> <span class="no">MAP_SIZE</span><span class="o">,</span> <span class="kc">false</span><span class="o">);</span>
<span class="c1">// add 1500 rows to file</span>
<span class="k">for</span><span class="o">(</span><span class="kt">int</span> <span class="n">r</span><span class="o">=</span><span class="mi">0</span><span class="o">;</span> <span class="n">r</span> <span class="o">&lt;</span> <span class="mi">1500</span><span class="o">;</span> <span class="o">++</span><span class="n">r</span><span class="o">)</span> <span class="o">{</span>
<span class="kt">int</span> <span class="n">row</span> <span class="o">=</span> <span class="n">batch</span><span class="o">.</span><span class="na">size</span><span class="o">++;</span>
<span class="n">first</span><span class="o">.</span><span class="na">vector</span><span class="o">[</span><span class="n">row</span><span class="o">]</span> <span class="o">=</span> <span class="n">r</span><span class="o">;</span>
<span class="n">second</span><span class="o">.</span><span class="na">vector</span><span class="o">[</span><span class="n">row</span><span class="o">]</span> <span class="o">=</span> <span class="n">r</span> <span class="o">*</span> <span class="mi">3</span><span class="o">;</span>
<span class="n">map</span><span class="o">.</span><span class="na">offsets</span><span class="o">[</span><span class="n">row</span><span class="o">]</span> <span class="o">=</span> <span class="n">map</span><span class="o">.</span><span class="na">childCount</span><span class="o">;</span>
<span class="n">map</span><span class="o">.</span><span class="na">lengths</span><span class="o">[</span><span class="n">row</span><span class="o">]</span> <span class="o">=</span> <span class="no">MAP_SIZE</span><span class="o">;</span>
<span class="n">map</span><span class="o">.</span><span class="na">childCount</span> <span class="o">+=</span> <span class="no">MAP_SIZE</span><span class="o">;</span>
<span class="k">for</span> <span class="o">(</span><span class="kt">int</span> <span class="n">mapElem</span> <span class="o">=</span> <span class="o">(</span><span class="kt">int</span><span class="o">)</span> <span class="n">map</span><span class="o">.</span><span class="na">offsets</span><span class="o">[</span><span class="n">row</span><span class="o">];</span>
<span class="n">mapElem</span> <span class="o">&lt;</span> <span class="n">map</span><span class="o">.</span><span class="na">offsets</span><span class="o">[</span><span class="n">row</span><span class="o">]</span> <span class="o">+</span> <span class="no">MAP_SIZE</span><span class="o">;</span> <span class="o">++</span><span class="n">mapElem</span><span class="o">)</span> <span class="o">{</span>
<span class="nc">String</span> <span class="n">key</span> <span class="o">=</span> <span class="s">"row "</span> <span class="o">+</span> <span class="n">r</span> <span class="o">+</span> <span class="s">"."</span> <span class="o">+</span> <span class="o">(</span><span class="n">mapElem</span> <span class="o">-</span> <span class="n">map</span><span class="o">.</span><span class="na">offsets</span><span class="o">[</span><span class="n">row</span><span class="o">]);</span>
<span class="n">mapKey</span><span class="o">.</span><span class="na">setVal</span><span class="o">(</span><span class="n">mapElem</span><span class="o">,</span> <span class="n">key</span><span class="o">.</span><span class="na">getBytes</span><span class="o">(</span><span class="nc">StandardCharsets</span><span class="o">.</span><span class="na">UTF_8</span><span class="o">));</span>
<span class="n">mapValue</span><span class="o">.</span><span class="na">vector</span><span class="o">[</span><span class="n">mapElem</span><span class="o">]</span> <span class="o">=</span> <span class="n">mapElem</span><span class="o">;</span>
<span class="o">}</span>
<span class="k">if</span> <span class="o">(</span><span class="n">row</span> <span class="o">==</span> <span class="no">BATCH_SIZE</span> <span class="o">-</span> <span class="mi">1</span><span class="o">)</span> <span class="o">{</span>
<span class="n">writer</span><span class="o">.</span><span class="na">addRowBatch</span><span class="o">(</span><span class="n">batch</span><span class="o">);</span>
<span class="n">batch</span><span class="o">.</span><span class="na">reset</span><span class="o">();</span>
<span class="o">}</span>
<span class="o">}</span>
<span class="k">if</span> <span class="o">(</span><span class="n">batch</span><span class="o">.</span><span class="na">size</span> <span class="o">!=</span> <span class="mi">0</span><span class="o">)</span> <span class="o">{</span>
<span class="n">writer</span><span class="o">.</span><span class="na">addRowBatch</span><span class="o">(</span><span class="n">batch</span><span class="o">);</span>
<span class="n">batch</span><span class="o">.</span><span class="na">reset</span><span class="o">();</span>
<span class="o">}</span>
<span class="n">writer</span><span class="o">.</span><span class="na">close</span><span class="o">();</span>
</code></pre></div></div>
<h2 id="reading-orc-files">Reading ORC Files</h2>
<p>To read ORC files, use the
<a href="/api/orc-core/index.html?org/apache/orc/OrcFile.html">OrcFile</a>
class to create a
<a href="/api/orc-core/index.html?org/apache/orc/Reader.html">Reader</a>
that contains the metadata about the file. The ORC reader accepts a few
options, but far fewer than the writer, and none of them are
required. The reader has methods for getting the file's number of rows,
schema, compression, etc.</p>
<div class="language-java highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="nc">Reader</span> <span class="n">reader</span> <span class="o">=</span> <span class="nc">OrcFile</span><span class="o">.</span><span class="na">createReader</span><span class="o">(</span><span class="k">new</span> <span class="nc">Path</span><span class="o">(</span><span class="s">"my-file.orc"</span><span class="o">),</span>
<span class="nc">OrcFile</span><span class="o">.</span><span class="na">readerOptions</span><span class="o">(</span><span class="n">conf</span><span class="o">));</span>
</code></pre></div></div>
<p>To get the data, create a
<a href="/api/orc-core/index.html?org/apache/orc/RecordReader.html">RecordReader</a>
object. By default, the RecordReader reads all rows and all columns,
but there are options to control the data that is read.</p>
<div class="language-java highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="nc">RecordReader</span> <span class="n">rows</span> <span class="o">=</span> <span class="n">reader</span><span class="o">.</span><span class="na">rows</span><span class="o">();</span>
<span class="nc">VectorizedRowBatch</span> <span class="n">batch</span> <span class="o">=</span> <span class="n">reader</span><span class="o">.</span><span class="na">getSchema</span><span class="o">().</span><span class="na">createRowBatch</span><span class="o">();</span>
</code></pre></div></div>
<p>With a <code class="highlighter-rouge">RecordReader</code> the user can ask for the next batch until there
are no more left. The reader will stop the batch at certain boundaries, so the
returned batch may not be full, but it will always contain some rows.</p>
<div class="language-java highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="k">while</span> <span class="o">(</span><span class="n">rows</span><span class="o">.</span><span class="na">nextBatch</span><span class="o">(</span><span class="n">batch</span><span class="o">))</span> <span class="o">{</span>
<span class="k">for</span><span class="o">(</span><span class="kt">int</span> <span class="n">r</span><span class="o">=</span><span class="mi">0</span><span class="o">;</span> <span class="n">r</span> <span class="o">&lt;</span> <span class="n">batch</span><span class="o">.</span><span class="na">size</span><span class="o">;</span> <span class="o">++</span><span class="n">r</span><span class="o">)</span> <span class="o">{</span>
<span class="o">...</span> <span class="n">process</span> <span class="n">row</span> <span class="n">r</span> <span class="n">from</span> <span class="n">batch</span>
<span class="o">}</span>
<span class="o">}</span>
<span class="n">rows</span><span class="o">.</span><span class="na">close</span><span class="o">();</span>
</code></pre></div></div>
<div class="section-nav">
<div class="left align-right">
<a href="/docs/mapreduce.html" class="prev">Back</a>
</div>
<div class="right align-left">
<a href="/docs/core-cpp.html" class="next">Next</a>
</div>
</div>
<div class="clear"></div>
</article>
</div>
<div class="unit one-fifth hide-on-mobiles">
<aside>
<h4>Overview</h4>
<ul>
<li class=""><a href="/docs/index.html">Background</a></li>
<li class=""><a href="/docs/adopters.html">ORC Adopters</a></li>
<li class=""><a href="/docs/types.html">Types</a></li>
<li class=""><a href="/docs/indexes.html">Indexes</a></li>
<li class=""><a href="/docs/acid.html">ACID support</a></li>
</ul>
<h4>Installing</h4>
<ul>
<li class=""><a href="/docs/building.html">Building ORC</a></li>
</ul>
<h4>Using in Spark</h4>
<ul>
<li class=""><a href="/docs/spark-ddl.html">Spark DDL</a></li>
<li class=""><a href="/docs/spark-config.html">Spark Configuration</a></li>
</ul>
<h4>Using in Python</h4>
<ul>
<li class=""><a href="/docs/pyarrow.html">PyArrow</a></li>
<li class=""><a href="/docs/dask.html">Dask</a></li>
</ul>
<h4>Using in Hive</h4>
<ul>
<li class=""><a href="/docs/hive-ddl.html">Hive DDL</a></li>
<li class=""><a href="/docs/hive-config.html">Hive Configuration</a></li>
</ul>
<h4>Using in MapReduce</h4>
<ul>
<li class=""><a href="/docs/mapred.html">Using in MapRed</a></li>
<li class=""><a href="/docs/mapreduce.html">Using in MapReduce</a></li>
</ul>
<h4>Using ORC Core</h4>
<ul>
<li class="current"><a href="/docs/core-java.html">Using Core Java</a></li>
<li class=""><a href="/docs/core-cpp.html">Using Core C++</a></li>
<li class=""><a href="/docs/core-java-config.html">ORC Java configuration</a></li>
</ul>
<h4>Tools</h4>
<ul>
<li class=""><a href="/docs/cpp-tools.html">C++ Tools</a></li>
<li class=""><a href="/docs/java-tools.html">Java Tools</a></li>
</ul>
</aside>
</div>
<div class="clear"></div>
</div>
</section>
<footer role="contentinfo">
<p style="margin-left: 20px; margin-right: 20px; text-align: center">The contents of this website are &copy;&nbsp;2024
<a href="https://www.apache.org/">Apache Software Foundation</a>
under the terms of the <a
href="https://www.apache.org/licenses/LICENSE-2.0.html">
Apache&nbsp;License&nbsp;v2</a>. Apache ORC and its logo are trademarks
of the Apache Software Foundation.</p>
</footer>
<script>
// Builds the permalink <a> element for a heading.
//   id - the heading's id attribute; it becomes the fragment ("#" + id)
//        that the link points at.
// Returns a new, detached anchor element; the caller is responsible for
// inserting it into the document.
var anchorForId = function (id) {
  var link = document.createElement("a");
  link.title = "Permalink";
  link.href = "#" + id;
  link.className = "header-link";
  // Link content: a "Permalink" text label in an sr-only span followed by
  // an icon element (presumably a Font Awesome glyph - confirm against the
  // site stylesheet).
  link.innerHTML = "<span class=\"sr-only\">Permalink</span><i class=\"fa fa-link\"></i>";
  return link;
};
// Appends a permalink anchor to every heading of the given level inside
// containingElement, skipping headings that have no usable id.
//   level             - heading level; "h" + level is the tag name searched for.
//   containingElement - element whose descendants are searched.
var linkifyAnchors = function (level, containingElement) {
  var headings = containingElement.getElementsByTagName("h" + level);
  for (var index = 0; index < headings.length; index++) {
    var heading = headings[index];
    // Without an id there is nothing for the permalink to point at.
    if (typeof heading.id === "undefined" || heading.id === "") {
      continue;
    }
    heading.appendChild(anchorForId(heading.id));
  }
};
// Once the document reports readyState "complete", adds permalink anchors
// to all h1-h6 headings inside the first element with class "docs", or,
// failing that, the first element with class "news". Does nothing when
// neither element exists.
document.onreadystatechange = function () {
  if (this.readyState !== "complete") {
    return;
  }
  var contentBlock = document.getElementsByClassName("docs")[0];
  if (!contentBlock) {
    // Fall back to the news container when there is no docs container.
    contentBlock = document.getElementsByClassName("news")[0];
  }
  if (!contentBlock) {
    return;
  }
  var level = 1;
  while (level <= 6) {
    linkifyAnchors(level, contentBlock);
    level++;
  }
};
</script>
</body>
</html>