blob: 38362f048106decb05c81b10e195666faf079abd [file] [log] [blame]
<!DOCTYPE HTML>
<html lang="en-US">
<head>
<meta charset="UTF-8">
<title>Using Core C++</title>
<meta name="viewport" content="width=device-width,initial-scale=1">
<meta name="generator" content="Jekyll v3.8.6">
<link rel="stylesheet" href="//fonts.googleapis.com/css?family=Lato:300,300italic,400,400italic,700,700italic,900">
<link rel="stylesheet" href="/css/screen.css">
<link rel="icon" type="image/x-icon" href="/favicon.ico">
<!--[if lt IE 9]>
<script src="/js/html5shiv.min.js"></script>
<script src="/js/respond.min.js"></script>
<![endif]-->
</head>
<body class="wrap">
<header role="banner">
<nav class="mobile-nav show-on-mobiles">
<ul>
<li class="">
<a href="/">Home</a>
</li>
<li class="">
<a href="/releases/"><span class="show-on-mobiles">Rel</span>
<span class="hide-on-mobiles">Releases</span></a>
</li>
<li class="current">
<a href="/docs/"><span class="show-on-mobiles">Doc</span>
<span class="hide-on-mobiles">Documentation</span></a>
</li>
<li class="">
<a href="/talks/"><span class="show-on-mobiles">Talk</span>
<span class="hide-on-mobiles">Talks</span></a>
</li>
<li class="">
<a href="/news/">News</a>
</li>
<li class="">
<a href="/develop/"><span class="show-on-mobiles">Dev</span>
<span class="hide-on-mobiles">Develop</span></a>
</li>
<li class="">
<a href="/help/">Help</a>
</li>
</ul>
</nav>
<div class="grid">
<div class="unit one-quarter center-on-mobiles">
<h1>
<a href="/">
<span class="sr-only">Apache ORC</span>
<img src="/img/logo.png" width="249" height="101" alt="ORC Logo">
</a>
</h1>
</div>
<nav class="main-nav unit three-quarters hide-on-mobiles">
<ul>
<li class="">
<a href="/">Home</a>
</li>
<li class="">
<a href="/releases/"><span class="show-on-mobiles">Rel</span>
<span class="hide-on-mobiles">Releases</span></a>
</li>
<li class="current">
<a href="/docs/"><span class="show-on-mobiles">Doc</span>
<span class="hide-on-mobiles">Documentation</span></a>
</li>
<li class="">
<a href="/talks/"><span class="show-on-mobiles">Talk</span>
<span class="hide-on-mobiles">Talks</span></a>
</li>
<li class="">
<a href="/news/">News</a>
</li>
<li class="">
<a href="/develop/"><span class="show-on-mobiles">Dev</span>
<span class="hide-on-mobiles">Develop</span></a>
</li>
<li class="">
<a href="/help/">Help</a>
</li>
</ul>
</nav>
</div>
</header>
<section class="docs">
<div class="grid">
<div class="docs-nav-mobile unit whole show-on-mobiles">
<select onchange="if (this.value) window.location.href=this.value">
<option value="">Navigate the docs…</option>
<optgroup label="Overview">
<option value="/docs/index.html">Background</option>
<option value="/docs/adopters.html">ORC Adopters</option>
<option value="/docs/types.html">Types</option>
<option value="/docs/indexes.html">Indexes</option>
<option value="/docs/acid.html">ACID support</option>
</optgroup>
<optgroup label="Installing">
<option value="/docs/building.html">Building ORC</option>
</optgroup>
<optgroup label="Using in Spark">
<option value="/docs/spark-ddl.html">Spark DDL</option>
<option value="/docs/spark-config.html">Spark Configuration</option>
</optgroup>
<optgroup label="Using in Python">
<option value="/docs/pyarrow.html">PyArrow</option>
<option value="/docs/dask.html">Dask</option>
</optgroup>
<optgroup label="Using in Hive">
<option value="/docs/hive-ddl.html">Hive DDL</option>
<option value="/docs/hive-config.html">Hive Configuration</option>
</optgroup>
<optgroup label="Using in MapReduce">
<option value="/docs/mapred.html">Using in MapRed</option>
<option value="/docs/mapreduce.html">Using in MapReduce</option>
</optgroup>
<optgroup label="Using ORC Core">
<option value="/docs/core-java.html">Using Core Java</option>
<option value="/docs/core-cpp.html">Using Core C++</option>
<option value="/docs/core-java-config.html">ORC Java configuration</option>
</optgroup>
<optgroup label="Tools">
<option value="/docs/cpp-tools.html">C++ Tools</option>
<option value="/docs/java-tools.html">Java Tools</option>
</optgroup>
</select>
</div>
<div class="unit four-fifths">
<article>
<h1>Using Core C++</h1>
<p>The C++ Core ORC API reads and writes ORC files into its own
orc::ColumnVectorBatch vectorized classes.</p>
<h2 id="vectorized-row-batch">Vectorized Row Batch</h2>
<p>Data is passed to ORC as instances of orc::ColumnVectorBatch
that contain the data for a batch of rows. The focus is on speed and
accessing the data fields directly. <code class="highlighter-rouge">numElements</code> is the number
of rows. ColumnVectorBatch is the parent type of the different
kinds of columns and has some fields that are shared across
all of the column types. In particular, the <code class="highlighter-rouge">hasNulls</code> flag
indicates whether there is any null in this column for this batch. For columns
where <code class="highlighter-rouge">hasNulls == true</code>, the corresponding entry of the <code class="highlighter-rouge">notNull</code> buffer is false if that
value is null.</p>
<div class="language-cpp highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="k">namespace</span> <span class="n">orc</span> <span class="p">{</span>
<span class="k">struct</span> <span class="nc">ColumnVectorBatch</span> <span class="p">{</span>
<span class="kt">uint64_t</span> <span class="n">numElements</span><span class="p">;</span>
<span class="n">DataBuffer</span><span class="o">&lt;</span><span class="kt">char</span><span class="o">&gt;</span> <span class="n">notNull</span><span class="p">;</span>
<span class="kt">bool</span> <span class="n">hasNulls</span><span class="p">;</span>
<span class="p">...</span>
<span class="p">}</span>
<span class="p">}</span>
</code></pre></div></div>
<p>The subtypes of ColumnVectorBatch are:</p>
<table>
<thead>
<tr>
<th>ORC Type</th>
<th>ColumnVectorBatch</th>
</tr>
</thead>
<tbody>
<tr>
<td>array</td>
<td>ListVectorBatch</td>
</tr>
<tr>
<td>binary</td>
<td>StringVectorBatch</td>
</tr>
<tr>
<td>bigint</td>
<td>LongVectorBatch</td>
</tr>
<tr>
<td>boolean</td>
<td>LongVectorBatch</td>
</tr>
<tr>
<td>char</td>
<td>StringVectorBatch</td>
</tr>
<tr>
<td>date</td>
<td>LongVectorBatch</td>
</tr>
<tr>
<td>decimal</td>
<td>Decimal64VectorBatch, Decimal128VectorBatch</td>
</tr>
<tr>
<td>double</td>
<td>DoubleVectorBatch</td>
</tr>
<tr>
<td>float</td>
<td>DoubleVectorBatch</td>
</tr>
<tr>
<td>int</td>
<td>LongVectorBatch</td>
</tr>
<tr>
<td>map</td>
<td>MapVectorBatch</td>
</tr>
<tr>
<td>smallint</td>
<td>LongVectorBatch</td>
</tr>
<tr>
<td>string</td>
<td>StringVectorBatch</td>
</tr>
<tr>
<td>struct</td>
<td>StructVectorBatch</td>
</tr>
<tr>
<td>timestamp</td>
<td>TimestampVectorBatch</td>
</tr>
<tr>
<td>tinyint</td>
<td>LongVectorBatch</td>
</tr>
<tr>
<td>uniontype</td>
<td>UnionVectorBatch</td>
</tr>
<tr>
<td>varchar</td>
<td>StringVectorBatch</td>
</tr>
</tbody>
</table>
<p>LongVectorBatch handles all of the integer types (boolean, bigint,
date, int, smallint, and tinyint). The data is represented as a
buffer of int64_t where each value is sign-extended as necessary.</p>
<div class="language-cpp highlighter-rouge"><div class="highlight"><pre class="highlight"><code> <span class="k">struct</span> <span class="nc">LongVectorBatch</span><span class="o">:</span> <span class="k">public</span> <span class="n">ColumnVectorBatch</span> <span class="p">{</span>
<span class="n">DataBuffer</span><span class="o">&lt;</span><span class="kt">int64_t</span><span class="o">&gt;</span> <span class="n">data</span><span class="p">;</span>
<span class="p">...</span>
<span class="p">};</span>
</code></pre></div></div>
<p>TimestampVectorBatch handles timestamp values. The data is
represented as two buffers of int64_t for seconds and nanoseconds
respectively. Note that we always assume data is in the GMT timezone;
therefore it is the user’s responsibility to convert wall clock time
from the local timezone to GMT.</p>
<div class="language-cpp highlighter-rouge"><div class="highlight"><pre class="highlight"><code> <span class="k">struct</span> <span class="nc">TimestampVectorBatch</span><span class="o">:</span> <span class="k">public</span> <span class="n">ColumnVectorBatch</span> <span class="p">{</span>
<span class="n">DataBuffer</span><span class="o">&lt;</span><span class="kt">int64_t</span><span class="o">&gt;</span> <span class="n">data</span><span class="p">;</span>
<span class="n">DataBuffer</span><span class="o">&lt;</span><span class="kt">int64_t</span><span class="o">&gt;</span> <span class="n">nanoseconds</span><span class="p">;</span>
<span class="p">...</span>
<span class="p">};</span>
</code></pre></div></div>
<p>DoubleVectorBatch handles all of the floating point types
(double and float). The data is represented as a buffer of doubles.</p>
<div class="language-cpp highlighter-rouge"><div class="highlight"><pre class="highlight"><code> <span class="k">struct</span> <span class="nc">DoubleVectorBatch</span><span class="o">:</span> <span class="k">public</span> <span class="n">ColumnVectorBatch</span> <span class="p">{</span>
<span class="n">DataBuffer</span><span class="o">&lt;</span><span class="kt">double</span><span class="o">&gt;</span> <span class="n">data</span><span class="p">;</span>
<span class="p">...</span>
<span class="p">};</span>
</code></pre></div></div>
<p>Decimal64VectorBatch handles decimal columns with precision no
greater than 18. Decimal128VectorBatch handles the others. The data
is represented as a buffer of int64_t and orc::Int128 respectively.</p>
<div class="language-cpp highlighter-rouge"><div class="highlight"><pre class="highlight"><code> <span class="k">struct</span> <span class="nc">Decimal64VectorBatch</span><span class="o">:</span> <span class="k">public</span> <span class="n">ColumnVectorBatch</span> <span class="p">{</span>
<span class="n">DataBuffer</span><span class="o">&lt;</span><span class="kt">int64_t</span><span class="o">&gt;</span> <span class="n">values</span><span class="p">;</span>
<span class="p">...</span>
<span class="p">};</span>
<span class="k">struct</span> <span class="nc">Decimal128VectorBatch</span><span class="o">:</span> <span class="k">public</span> <span class="n">ColumnVectorBatch</span> <span class="p">{</span>
<span class="n">DataBuffer</span><span class="o">&lt;</span><span class="n">Int128</span><span class="o">&gt;</span> <span class="n">values</span><span class="p">;</span>
<span class="p">...</span>
<span class="p">};</span>
</code></pre></div></div>
<p>StringVectorBatch handles all of the binary types (binary,
char, string, and varchar). The data is represented as a char* buffer,
and a length buffer.</p>
<div class="language-cpp highlighter-rouge"><div class="highlight"><pre class="highlight"><code> <span class="k">struct</span> <span class="nc">StringVectorBatch</span><span class="o">:</span> <span class="k">public</span> <span class="n">ColumnVectorBatch</span> <span class="p">{</span>
<span class="n">DataBuffer</span><span class="o">&lt;</span><span class="kt">char</span><span class="o">*&gt;</span> <span class="n">data</span><span class="p">;</span>
<span class="n">DataBuffer</span><span class="o">&lt;</span><span class="kt">int64_t</span><span class="o">&gt;</span> <span class="n">length</span><span class="p">;</span>
<span class="p">...</span>
<span class="p">};</span>
</code></pre></div></div>
<p>StructVectorBatch handles the struct columns and represents
the data as a vector of pointers to <code class="highlighter-rouge">ColumnVectorBatch</code>, one for each field.</p>
<div class="language-cpp highlighter-rouge"><div class="highlight"><pre class="highlight"><code> <span class="k">struct</span> <span class="nc">StructVectorBatch</span><span class="o">:</span> <span class="k">public</span> <span class="n">ColumnVectorBatch</span> <span class="p">{</span>
<span class="n">std</span><span class="o">::</span><span class="n">vector</span><span class="o">&lt;</span><span class="n">ColumnVectorBatch</span><span class="o">*&gt;</span> <span class="n">fields</span><span class="p">;</span>
<span class="p">...</span>
<span class="p">};</span>
</code></pre></div></div>
<p>UnionVectorBatch handles the union columns. It uses <code class="highlighter-rouge">tags</code>
to indicate which subtype has the value and <code class="highlighter-rouge">offsets</code> to indicate
the offset in the child batch of that subtype. An individual
<code class="highlighter-rouge">ColumnVectorBatch</code> is used for each subtype.</p>
<div class="language-cpp highlighter-rouge"><div class="highlight"><pre class="highlight"><code> <span class="k">struct</span> <span class="nc">UnionVectorBatch</span><span class="o">:</span> <span class="k">public</span> <span class="n">ColumnVectorBatch</span> <span class="p">{</span>
<span class="n">DataBuffer</span><span class="o">&lt;</span><span class="kt">unsigned</span> <span class="kt">char</span><span class="o">&gt;</span> <span class="n">tags</span><span class="p">;</span>
<span class="n">DataBuffer</span><span class="o">&lt;</span><span class="kt">uint64_t</span><span class="o">&gt;</span> <span class="n">offsets</span><span class="p">;</span>
<span class="n">std</span><span class="o">::</span><span class="n">vector</span><span class="o">&lt;</span><span class="n">ColumnVectorBatch</span><span class="o">*&gt;</span> <span class="n">children</span><span class="p">;</span>
<span class="p">...</span>
<span class="p">};</span>
</code></pre></div></div>
<p>ListVectorBatch handles the array columns and represents
the data as a buffer of integers for the offsets and a
<code class="highlighter-rouge">ColumnVectorBatch</code> for the children values.</p>
<div class="language-cpp highlighter-rouge"><div class="highlight"><pre class="highlight"><code> <span class="k">struct</span> <span class="nc">ListVectorBatch</span><span class="o">:</span> <span class="k">public</span> <span class="n">ColumnVectorBatch</span> <span class="p">{</span>
<span class="n">DataBuffer</span><span class="o">&lt;</span><span class="kt">int64_t</span><span class="o">&gt;</span> <span class="n">offsets</span><span class="p">;</span>
<span class="n">std</span><span class="o">::</span><span class="n">unique_ptr</span><span class="o">&lt;</span><span class="n">ColumnVectorBatch</span><span class="o">&gt;</span> <span class="n">elements</span><span class="p">;</span>
<span class="p">...</span>
<span class="p">};</span>
</code></pre></div></div>
<p>MapVectorBatch handles the map columns and represents the data
as a buffer of integers for the offsets and two <code class="highlighter-rouge">ColumnVectorBatch</code> instances
for the keys and values.</p>
<div class="language-cpp highlighter-rouge"><div class="highlight"><pre class="highlight"><code> <span class="k">struct</span> <span class="nc">MapVectorBatch</span><span class="o">:</span> <span class="k">public</span> <span class="n">ColumnVectorBatch</span> <span class="p">{</span>
<span class="n">DataBuffer</span><span class="o">&lt;</span><span class="kt">int64_t</span><span class="o">&gt;</span> <span class="n">offsets</span><span class="p">;</span>
<span class="n">std</span><span class="o">::</span><span class="n">unique_ptr</span><span class="o">&lt;</span><span class="n">ColumnVectorBatch</span><span class="o">&gt;</span> <span class="n">keys</span><span class="p">;</span>
<span class="n">std</span><span class="o">::</span><span class="n">unique_ptr</span><span class="o">&lt;</span><span class="n">ColumnVectorBatch</span><span class="o">&gt;</span> <span class="n">elements</span><span class="p">;</span>
<span class="p">...</span>
<span class="p">};</span>
</code></pre></div></div>
<h2 id="writing-orc-files">Writing ORC Files</h2>
<p>To write an ORC file, you need to include <code class="highlighter-rouge">OrcFile.hh</code> and define
the schema; then use <code class="highlighter-rouge">orc::OutputStream</code> and <code class="highlighter-rouge">orc::WriterOptions</code>
to create an <code class="highlighter-rouge">orc::Writer</code> with the desired filename. This example
sets the required schema parameter, but there are many other
options to control the ORC writer.</p>
<div class="language-cpp highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="n">std</span><span class="o">::</span><span class="n">unique_ptr</span><span class="o">&lt;</span><span class="n">OutputStream</span><span class="o">&gt;</span> <span class="n">outStream</span> <span class="o">=</span>
<span class="n">writeLocalFile</span><span class="p">(</span><span class="s">"my-file.orc"</span><span class="p">);</span>
<span class="n">std</span><span class="o">::</span><span class="n">unique_ptr</span><span class="o">&lt;</span><span class="n">Type</span><span class="o">&gt;</span> <span class="n">schema</span><span class="p">(</span>
<span class="n">Type</span><span class="o">::</span><span class="n">buildTypeFromString</span><span class="p">(</span><span class="s">"struct&lt;x:int,y:int&gt;"</span><span class="p">));</span>
<span class="n">WriterOptions</span> <span class="n">options</span><span class="p">;</span>
<span class="n">std</span><span class="o">::</span><span class="n">unique_ptr</span><span class="o">&lt;</span><span class="n">Writer</span><span class="o">&gt;</span> <span class="n">writer</span> <span class="o">=</span>
<span class="n">createWriter</span><span class="p">(</span><span class="o">*</span><span class="n">schema</span><span class="p">,</span> <span class="n">outStream</span><span class="p">.</span><span class="n">get</span><span class="p">(),</span> <span class="n">options</span><span class="p">);</span>
</code></pre></div></div>
<p>Now you need to create a row batch, set the data, and write it to the file
as the batch fills up. When the file is done, close the <code class="highlighter-rouge">Writer</code>.</p>
<div class="language-cpp highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="kt">uint64_t</span> <span class="n">batchSize</span> <span class="o">=</span> <span class="mi">1024</span><span class="p">,</span> <span class="n">rowCount</span> <span class="o">=</span> <span class="mi">10000</span><span class="p">;</span>
<span class="n">std</span><span class="o">::</span><span class="n">unique_ptr</span><span class="o">&lt;</span><span class="n">ColumnVectorBatch</span><span class="o">&gt;</span> <span class="n">batch</span> <span class="o">=</span>
<span class="n">writer</span><span class="o">-&gt;</span><span class="n">createRowBatch</span><span class="p">(</span><span class="n">batchSize</span><span class="p">);</span>
<span class="n">StructVectorBatch</span> <span class="o">*</span><span class="n">root</span> <span class="o">=</span>
<span class="k">dynamic_cast</span><span class="o">&lt;</span><span class="n">StructVectorBatch</span> <span class="o">*&gt;</span><span class="p">(</span><span class="n">batch</span><span class="p">.</span><span class="n">get</span><span class="p">());</span>
<span class="n">LongVectorBatch</span> <span class="o">*</span><span class="n">x</span> <span class="o">=</span>
<span class="k">dynamic_cast</span><span class="o">&lt;</span><span class="n">LongVectorBatch</span> <span class="o">*&gt;</span><span class="p">(</span><span class="n">root</span><span class="o">-&gt;</span><span class="n">fields</span><span class="p">[</span><span class="mi">0</span><span class="p">]);</span>
<span class="n">LongVectorBatch</span> <span class="o">*</span><span class="n">y</span> <span class="o">=</span>
<span class="k">dynamic_cast</span><span class="o">&lt;</span><span class="n">LongVectorBatch</span> <span class="o">*&gt;</span><span class="p">(</span><span class="n">root</span><span class="o">-&gt;</span><span class="n">fields</span><span class="p">[</span><span class="mi">1</span><span class="p">]);</span>
<span class="kt">uint64_t</span> <span class="n">rows</span> <span class="o">=</span> <span class="mi">0</span><span class="p">;</span>
<span class="k">for</span> <span class="p">(</span><span class="kt">uint64_t</span> <span class="n">i</span> <span class="o">=</span> <span class="mi">0</span><span class="p">;</span> <span class="n">i</span> <span class="o">&lt;</span> <span class="n">rowCount</span><span class="p">;</span> <span class="o">++</span><span class="n">i</span><span class="p">)</span> <span class="p">{</span>
<span class="n">x</span><span class="o">-&gt;</span><span class="n">data</span><span class="p">[</span><span class="n">rows</span><span class="p">]</span> <span class="o">=</span> <span class="n">i</span><span class="p">;</span>
<span class="n">y</span><span class="o">-&gt;</span><span class="n">data</span><span class="p">[</span><span class="n">rows</span><span class="p">]</span> <span class="o">=</span> <span class="n">i</span> <span class="o">*</span> <span class="mi">3</span><span class="p">;</span>
<span class="n">rows</span><span class="o">++</span><span class="p">;</span>
<span class="k">if</span> <span class="p">(</span><span class="n">rows</span> <span class="o">==</span> <span class="n">batchSize</span><span class="p">)</span> <span class="p">{</span>
<span class="n">root</span><span class="o">-&gt;</span><span class="n">numElements</span> <span class="o">=</span> <span class="n">rows</span><span class="p">;</span>
<span class="n">x</span><span class="o">-&gt;</span><span class="n">numElements</span> <span class="o">=</span> <span class="n">rows</span><span class="p">;</span>
<span class="n">y</span><span class="o">-&gt;</span><span class="n">numElements</span> <span class="o">=</span> <span class="n">rows</span><span class="p">;</span>
<span class="n">writer</span><span class="o">-&gt;</span><span class="n">add</span><span class="p">(</span><span class="o">*</span><span class="n">batch</span><span class="p">);</span>
<span class="n">rows</span> <span class="o">=</span> <span class="mi">0</span><span class="p">;</span>
<span class="p">}</span>
<span class="p">}</span>
<span class="k">if</span> <span class="p">(</span><span class="n">rows</span> <span class="o">!=</span> <span class="mi">0</span><span class="p">)</span> <span class="p">{</span>
<span class="n">root</span><span class="o">-&gt;</span><span class="n">numElements</span> <span class="o">=</span> <span class="n">rows</span><span class="p">;</span>
<span class="n">x</span><span class="o">-&gt;</span><span class="n">numElements</span> <span class="o">=</span> <span class="n">rows</span><span class="p">;</span>
<span class="n">y</span><span class="o">-&gt;</span><span class="n">numElements</span> <span class="o">=</span> <span class="n">rows</span><span class="p">;</span>
<span class="n">writer</span><span class="o">-&gt;</span><span class="n">add</span><span class="p">(</span><span class="o">*</span><span class="n">batch</span><span class="p">);</span>
<span class="n">rows</span> <span class="o">=</span> <span class="mi">0</span><span class="p">;</span>
<span class="p">}</span>
<span class="n">writer</span><span class="o">-&gt;</span><span class="n">close</span><span class="p">();</span>
</code></pre></div></div>
<h2 id="reading-orc-files">Reading ORC Files</h2>
<p>To read ORC files, include the <code class="highlighter-rouge">OrcFile.hh</code> file and create an <code class="highlighter-rouge">orc::Reader</code>
that contains the metadata about the file. There are a few options to
the <code class="highlighter-rouge">orc::Reader</code>, but far fewer than the writer and none of them are
required. The reader has methods for getting the number of rows,
schema, compression, etc. from the file.</p>
<div class="language-cpp highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="n">std</span><span class="o">::</span><span class="n">unique_ptr</span><span class="o">&lt;</span><span class="n">InputStream</span><span class="o">&gt;</span> <span class="n">inStream</span> <span class="o">=</span>
<span class="n">readLocalFile</span><span class="p">(</span><span class="s">"my-file.orc"</span><span class="p">);</span>
<span class="n">ReaderOptions</span> <span class="n">options</span><span class="p">;</span>
<span class="n">std</span><span class="o">::</span><span class="n">unique_ptr</span><span class="o">&lt;</span><span class="n">Reader</span><span class="o">&gt;</span> <span class="n">reader</span> <span class="o">=</span>
<span class="n">createReader</span><span class="p">(</span><span class="n">inStream</span><span class="p">,</span> <span class="n">options</span><span class="p">);</span>
</code></pre></div></div>
<p>To get the data, create an <code class="highlighter-rouge">orc::RowReader</code> object. By default,
the RowReader reads all rows and all columns, but there are
options to control the data that is read.</p>
<div class="language-cpp highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="n">RowReaderOptions</span> <span class="n">rowReaderOptions</span><span class="p">;</span>
<span class="n">std</span><span class="o">::</span><span class="n">unique_ptr</span><span class="o">&lt;</span><span class="n">RowReader</span><span class="o">&gt;</span> <span class="n">rowReader</span> <span class="o">=</span>
<span class="n">reader</span><span class="o">-&gt;</span><span class="n">createRowReader</span><span class="p">(</span><span class="n">rowReaderOptions</span><span class="p">);</span>
<span class="n">std</span><span class="o">::</span><span class="n">unique_ptr</span><span class="o">&lt;</span><span class="n">ColumnVectorBatch</span><span class="o">&gt;</span> <span class="n">batch</span> <span class="o">=</span>
<span class="n">rowReader</span><span class="o">-&gt;</span><span class="n">createRowBatch</span><span class="p">(</span><span class="mi">1024</span><span class="p">);</span>
</code></pre></div></div>
<p>With an <code class="highlighter-rouge">orc::RowReader</code> the user can ask for the next batch until there
are no more left. The reader will stop the batch at certain boundaries,
so the returned batch may not be full, but it will always contain some rows.</p>
<div class="language-cpp highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="k">while</span> <span class="p">(</span><span class="n">rowReader</span><span class="o">-&gt;</span><span class="n">next</span><span class="p">(</span><span class="o">*</span><span class="n">batch</span><span class="p">))</span> <span class="p">{</span>
<span class="k">for</span> <span class="p">(</span><span class="kt">uint64_t</span> <span class="n">r</span> <span class="o">=</span> <span class="mi">0</span><span class="p">;</span> <span class="n">r</span> <span class="o">&lt;</span> <span class="n">batch</span><span class="o">-&gt;</span><span class="n">numElements</span><span class="p">;</span> <span class="o">++</span><span class="n">r</span><span class="p">)</span> <span class="p">{</span>
<span class="p">...</span> <span class="n">process</span> <span class="n">row</span> <span class="n">r</span> <span class="n">from</span> <span class="n">batch</span>
<span class="p">}</span>
<span class="p">}</span>
</code></pre></div></div>
<div class="section-nav">
<div class="left align-right">
<a href="/docs/core-java.html" class="prev">Back</a>
</div>
<div class="right align-left">
<a href="/docs/core-java-config.html" class="next">Next</a>
</div>
</div>
<div class="clear"></div>
</article>
</div>
<div class="unit one-fifth hide-on-mobiles">
<aside>
<h4>Overview</h4>
<ul>
<li class=""><a href="/docs/index.html">Background</a></li>
<li class=""><a href="/docs/adopters.html">ORC Adopters</a></li>
<li class=""><a href="/docs/types.html">Types</a></li>
<li class=""><a href="/docs/indexes.html">Indexes</a></li>
<li class=""><a href="/docs/acid.html">ACID support</a></li>
</ul>
<h4>Installing</h4>
<ul>
<li class=""><a href="/docs/building.html">Building ORC</a></li>
</ul>
<h4>Using in Spark</h4>
<ul>
<li class=""><a href="/docs/spark-ddl.html">Spark DDL</a></li>
<li class=""><a href="/docs/spark-config.html">Spark Configuration</a></li>
</ul>
<h4>Using in Python</h4>
<ul>
<li class=""><a href="/docs/pyarrow.html">PyArrow</a></li>
<li class=""><a href="/docs/dask.html">Dask</a></li>
</ul>
<h4>Using in Hive</h4>
<ul>
<li class=""><a href="/docs/hive-ddl.html">Hive DDL</a></li>
<li class=""><a href="/docs/hive-config.html">Hive Configuration</a></li>
</ul>
<h4>Using in MapReduce</h4>
<ul>
<li class=""><a href="/docs/mapred.html">Using in MapRed</a></li>
<li class=""><a href="/docs/mapreduce.html">Using in MapReduce</a></li>
</ul>
<h4>Using ORC Core</h4>
<ul>
<li class=""><a href="/docs/core-java.html">Using Core Java</a></li>
<li class="current"><a href="/docs/core-cpp.html">Using Core C++</a></li>
<li class=""><a href="/docs/core-java-config.html">ORC Java configuration</a></li>
</ul>
<h4>Tools</h4>
<ul>
<li class=""><a href="/docs/cpp-tools.html">C++ Tools</a></li>
<li class=""><a href="/docs/java-tools.html">Java Tools</a></li>
</ul>
</aside>
</div>
<div class="clear"></div>
</div>
</section>
<footer role="contentinfo">
<p style="margin-left: 20px; margin-right: 20px; text-align: center">The contents of this website are &copy;&nbsp;2024
<a href="https://www.apache.org/">Apache Software Foundation</a>
under the terms of the <a
href="https://www.apache.org/licenses/LICENSE-2.0.html">
Apache&nbsp;License&nbsp;v2</a>. Apache ORC and its logo are trademarks
of the Apache Software Foundation.</p>
</footer>
<script>
// Creates the permalink <a> element for a heading.
//   id: the heading's id attribute; the link targets "#" + id.
// Returns a new, detached anchor carrying the "header-link" class, a
// "Permalink" tooltip, screen-reader-only text and a link icon.
var anchorForId = function (id) {
  var permalink = document.createElement("a");
  permalink.title = "Permalink";
  permalink.href = "#" + id;
  permalink.className = "header-link";
  permalink.innerHTML = "<span class=\"sr-only\">Permalink</span><i class=\"fa fa-link\"></i>";
  return permalink;
};
// Appends a permalink anchor to every heading of one level inside a container.
//   level: heading level; the tag looked up is "h" + level.
//   containingElement: element whose descendant headings are processed.
// Headings whose id is undefined or empty are left untouched.
var linkifyAnchors = function (level, containingElement) {
  var headings = containingElement.getElementsByTagName("h" + level);
  var index = 0;
  while (index < headings.length) {
    var heading = headings[index];
    index += 1;
    if (typeof heading.id === "undefined" || heading.id === "") {
      continue;
    }
    heading.appendChild(anchorForId(heading.id));
  }
};
// When the document's readyState becomes "complete", add permalink anchors to
// every h1-h6 that has an id inside the first "docs" element, or inside the
// first "news" element when no "docs" element exists.
document.onreadystatechange = function () {
  if (this.readyState !== "complete") {
    return;
  }
  var target = document.getElementsByClassName("docs")[0] || document.getElementsByClassName("news")[0];
  if (target) {
    var depth = 1;
    while (depth <= 6) {
      linkifyAnchors(depth, target);
      depth++;
    }
  }
};
</script>
</body>
</html>