| <!DOCTYPE HTML> |
| <html lang="en-US"> |
| <head> |
| <meta charset="UTF-8"> |
| <title>Using Core C++</title> |
| <meta name="viewport" content="width=device-width,initial-scale=1"> |
| <meta name="generator" content="Jekyll v3.8.6"> |
| <link rel="stylesheet" href="//fonts.googleapis.com/css?family=Lato:300,300italic,400,400italic,700,700italic,900"> |
| <link rel="stylesheet" href="/css/screen.css"> |
| <link rel="icon" type="image/x-icon" href="/favicon.ico"> |
| <!--[if lt IE 9]> |
| <script src="/js/html5shiv.min.js"></script> |
| <script src="/js/respond.min.js"></script> |
| <![endif]--> |
| </head> |
| |
| |
| <body class="wrap"> |
| <header role="banner"> |
| <nav class="mobile-nav show-on-mobiles"> |
| <ul> |
| <li class=""> |
| <a href="/">Home</a> |
| </li> |
| <li class=""> |
| <a href="/releases/"><span class="show-on-mobiles">Rel</span> |
| <span class="hide-on-mobiles">Releases</span></a> |
| </li> |
| <li class="current"> |
| <a href="/docs/"><span class="show-on-mobiles">Doc</span> |
| <span class="hide-on-mobiles">Documentation</span></a> |
| </li> |
| <li class=""> |
| <a href="/talks/"><span class="show-on-mobiles">Talk</span> |
| <span class="hide-on-mobiles">Talks</span></a> |
| </li> |
| <li class=""> |
| <a href="/news/">News</a> |
| </li> |
| <li class=""> |
| <a href="/develop/"><span class="show-on-mobiles">Dev</span> |
| <span class="hide-on-mobiles">Develop</span></a> |
| </li> |
| <li class=""> |
| <a href="/help/">Help</a> |
| </li> |
| </ul> |
| |
| </nav> |
| <div class="grid"> |
| <div class="unit one-quarter center-on-mobiles"> |
| <h1> |
| <a href="/"> |
| <span class="sr-only">Apache ORC</span> |
| <img src="/img/logo.png" width="249" height="101" alt="ORC Logo"> |
| </a> |
| </h1> |
| </div> |
| <nav class="main-nav unit three-quarters hide-on-mobiles"> |
| <ul> |
| <li class=""> |
| <a href="/">Home</a> |
| </li> |
| <li class=""> |
| <a href="/releases/"><span class="show-on-mobiles">Rel</span> |
| <span class="hide-on-mobiles">Releases</span></a> |
| </li> |
| <li class="current"> |
| <a href="/docs/"><span class="show-on-mobiles">Doc</span> |
| <span class="hide-on-mobiles">Documentation</span></a> |
| </li> |
| <li class=""> |
| <a href="/talks/"><span class="show-on-mobiles">Talk</span> |
| <span class="hide-on-mobiles">Talks</span></a> |
| </li> |
| <li class=""> |
| <a href="/news/">News</a> |
| </li> |
| <li class=""> |
| <a href="/develop/"><span class="show-on-mobiles">Dev</span> |
| <span class="hide-on-mobiles">Develop</span></a> |
| </li> |
| <li class=""> |
| <a href="/help/">Help</a> |
| </li> |
| </ul> |
| |
| </nav> |
| </div> |
| </header> |
| |
| |
| <section class="docs"> |
| <div class="grid"> |
| |
| <div class="docs-nav-mobile unit whole show-on-mobiles"> |
| <select onchange="if (this.value) window.location.href=this.value"> |
| <option value="">Navigate the docs…</option> |
| |
| <optgroup label="Overview"> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| <option value="/docs/index.html">Background</option> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| <option value="/docs/adopters.html">ORC Adopters</option> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| <option value="/docs/types.html">Types</option> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| <option value="/docs/indexes.html">Indexes</option> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| <option value="/docs/acid.html">ACID support</option> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| </optgroup> |
| |
| <optgroup label="Installing"> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| <option value="/docs/building.html">Building ORC</option> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| </optgroup> |
| |
| <optgroup label="Using in Spark"> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| <option value="/docs/spark-ddl.html">Spark DDL</option> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| <option value="/docs/spark-config.html">Spark Configuration</option> |
| |
| |
| |
| |
| |
| |
| |
| |
| </optgroup> |
| |
| <optgroup label="Using in Python"> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| <option value="/docs/pyarrow.html">PyArrow</option> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| <option value="/docs/dask.html">Dask</option> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| </optgroup> |
| |
| <optgroup label="Using in Hive"> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| <option value="/docs/hive-ddl.html">Hive DDL</option> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| <option value="/docs/hive-config.html">Hive Configuration</option> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| </optgroup> |
| |
| <optgroup label="Using in MapReduce"> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| <option value="/docs/mapred.html">Using in MapRed</option> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| <option value="/docs/mapreduce.html">Using in MapReduce</option> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| </optgroup> |
| |
| <optgroup label="Using ORC Core"> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| <option value="/docs/core-java.html">Using Core Java</option> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| <option value="/docs/core-cpp.html">Using Core C++</option> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| <option value="/docs/core-java-config.html">ORC Java configuration</option> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| </optgroup> |
| |
| <optgroup label="Tools"> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| <option value="/docs/cpp-tools.html">C++ Tools</option> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| <option value="/docs/java-tools.html">Java Tools</option> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| </optgroup> |
| |
| </select> |
| </div> |
| |
| |
| <div class="unit four-fifths"> |
| <article> |
| <h1>Using Core C++</h1> |
| <p>The C++ Core ORC API reads and writes ORC files into its own |
| orc::ColumnVectorBatch vectorized classes.</p> |
| |
| <h2 id="vectorized-row-batch">Vectorized Row Batch</h2> |
| |
| <p>Data is passed to ORC as instances of orc::ColumnVectorBatch |
| that contain the data for a batch of rows. The focus is on speed and |
| accessing the data fields directly. <code class="highlighter-rouge">numElements</code> is the number |
| of rows. ColumnVectorBatch is the parent type of the different |
| kinds of columns and has some fields that are shared across |
| all of the column types. In particular, the <code class="highlighter-rouge">hasNulls</code> flag |
| indicates whether there is any null in this column for this batch. For columns |
| where <code class="highlighter-rouge">hasNulls == true</code>, the <code class="highlighter-rouge">notNull</code> buffer entry is false for each |
| value that is null.</p> |
| |
| <div class="language-cpp highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="k">namespace</span> <span class="n">orc</span> <span class="p">{</span> |
| <span class="k">struct</span> <span class="nc">ColumnVectorBatch</span> <span class="p">{</span> |
| <span class="kt">uint64_t</span> <span class="n">numElements</span><span class="p">;</span> |
| <span class="n">DataBuffer</span><span class="o"><</span><span class="kt">char</span><span class="o">></span> <span class="n">notNull</span><span class="p">;</span> |
| <span class="kt">bool</span> <span class="n">hasNulls</span><span class="p">;</span> |
| <span class="p">...</span> |
| <span class="p">}</span> |
| <span class="p">}</span> |
| </code></pre></div></div> |
| |
| <p>The subtypes of ColumnVectorBatch are:</p> |
| |
| <table> |
| <thead> |
| <tr> |
| <th>ORC Type</th> |
| <th>ColumnVectorBatch</th> |
| </tr> |
| </thead> |
| <tbody> |
| <tr> |
| <td>array</td> |
| <td>ListVectorBatch</td> |
| </tr> |
| <tr> |
| <td>binary</td> |
| <td>StringVectorBatch</td> |
| </tr> |
| <tr> |
| <td>bigint</td> |
| <td>LongVectorBatch</td> |
| </tr> |
| <tr> |
| <td>boolean</td> |
| <td>LongVectorBatch</td> |
| </tr> |
| <tr> |
| <td>char</td> |
| <td>StringVectorBatch</td> |
| </tr> |
| <tr> |
| <td>date</td> |
| <td>LongVectorBatch</td> |
| </tr> |
| <tr> |
| <td>decimal</td> |
| <td>Decimal64VectorBatch, Decimal128VectorBatch</td> |
| </tr> |
| <tr> |
| <td>double</td> |
| <td>DoubleVectorBatch</td> |
| </tr> |
| <tr> |
| <td>float</td> |
| <td>DoubleVectorBatch</td> |
| </tr> |
| <tr> |
| <td>int</td> |
| <td>LongVectorBatch</td> |
| </tr> |
| <tr> |
| <td>map</td> |
| <td>MapVectorBatch</td> |
| </tr> |
| <tr> |
| <td>smallint</td> |
| <td>LongVectorBatch</td> |
| </tr> |
| <tr> |
| <td>string</td> |
| <td>StringVectorBatch</td> |
| </tr> |
| <tr> |
| <td>struct</td> |
| <td>StructVectorBatch</td> |
| </tr> |
| <tr> |
| <td>timestamp</td> |
| <td>TimestampVectorBatch</td> |
| </tr> |
| <tr> |
| <td>tinyint</td> |
| <td>LongVectorBatch</td> |
| </tr> |
| <tr> |
| <td>uniontype</td> |
| <td>UnionVectorBatch</td> |
| </tr> |
| <tr> |
| <td>varchar</td> |
| <td>StringVectorBatch</td> |
| </tr> |
| </tbody> |
| </table> |
| |
| <p>LongVectorBatch handles all of the integer types (boolean, bigint, |
| date, int, smallint, and tinyint). The data is represented as a |
| buffer of int64_t where each value is sign-extended as necessary.</p> |
| |
| <div class="language-cpp highlighter-rouge"><div class="highlight"><pre class="highlight"><code> <span class="k">struct</span> <span class="nc">LongVectorBatch</span><span class="o">:</span> <span class="k">public</span> <span class="n">ColumnVectorBatch</span> <span class="p">{</span> |
| <span class="n">DataBuffer</span><span class="o"><</span><span class="kt">int64_t</span><span class="o">></span> <span class="n">data</span><span class="p">;</span> |
| <span class="p">...</span> |
| <span class="p">};</span> |
| </code></pre></div></div> |
| |
| <p>TimestampVectorBatch handles timestamp values. The data is |
| represented as two buffers of int64_t for seconds and nanoseconds |
| respectively. Note that the data is always assumed to be in the GMT timezone; |
| therefore it is the user’s responsibility to convert wall clock time |
| from the local timezone to GMT.</p> |
| |
| <div class="language-cpp highlighter-rouge"><div class="highlight"><pre class="highlight"><code> <span class="k">struct</span> <span class="nc">TimestampVectorBatch</span><span class="o">:</span> <span class="k">public</span> <span class="n">ColumnVectorBatch</span> <span class="p">{</span> |
| <span class="n">DataBuffer</span><span class="o"><</span><span class="kt">int64_t</span><span class="o">></span> <span class="n">data</span><span class="p">;</span> |
| <span class="n">DataBuffer</span><span class="o"><</span><span class="kt">int64_t</span><span class="o">></span> <span class="n">nanoseconds</span><span class="p">;</span> |
| <span class="p">...</span> |
| <span class="p">};</span> |
| </code></pre></div></div> |
| |
| <p>DoubleVectorBatch handles all of the floating point types |
| (double and float). The data is represented as a buffer of doubles.</p> |
| |
| <div class="language-cpp highlighter-rouge"><div class="highlight"><pre class="highlight"><code> <span class="k">struct</span> <span class="nc">DoubleVectorBatch</span><span class="o">:</span> <span class="k">public</span> <span class="n">ColumnVectorBatch</span> <span class="p">{</span> |
| <span class="n">DataBuffer</span><span class="o"><</span><span class="kt">double</span><span class="o">></span> <span class="n">data</span><span class="p">;</span> |
| <span class="p">...</span> |
| <span class="p">};</span> |
| </code></pre></div></div> |
| |
| <p>Decimal64VectorBatch handles decimal columns with precision no |
| greater than 18. Decimal128VectorBatch handles the others. The data |
| is represented as a buffer of int64_t and orc::Int128 respectively.</p> |
| |
| <div class="language-cpp highlighter-rouge"><div class="highlight"><pre class="highlight"><code> <span class="k">struct</span> <span class="nc">Decimal64VectorBatch</span><span class="o">:</span> <span class="k">public</span> <span class="n">ColumnVectorBatch</span> <span class="p">{</span> |
| <span class="n">DataBuffer</span><span class="o"><</span><span class="kt">int64_t</span><span class="o">></span> <span class="n">values</span><span class="p">;</span> |
| <span class="p">...</span> |
| <span class="p">};</span> |
| |
| <span class="k">struct</span> <span class="nc">Decimal128VectorBatch</span><span class="o">:</span> <span class="k">public</span> <span class="n">ColumnVectorBatch</span> <span class="p">{</span> |
| <span class="n">DataBuffer</span><span class="o"><</span><span class="n">Int128</span><span class="o">></span> <span class="n">values</span><span class="p">;</span> |
| <span class="p">...</span> |
| <span class="p">};</span> |
| </code></pre></div></div> |
| |
| <p>StringVectorBatch handles all of the binary types (binary, |
| char, string, and varchar). The data is represented as a char* buffer, |
| and a length buffer.</p> |
| |
| <div class="language-cpp highlighter-rouge"><div class="highlight"><pre class="highlight"><code> <span class="k">struct</span> <span class="nc">StringVectorBatch</span><span class="o">:</span> <span class="k">public</span> <span class="n">ColumnVectorBatch</span> <span class="p">{</span> |
| <span class="n">DataBuffer</span><span class="o"><</span><span class="kt">char</span><span class="o">*></span> <span class="n">data</span><span class="p">;</span> |
| <span class="n">DataBuffer</span><span class="o"><</span><span class="kt">int64_t</span><span class="o">></span> <span class="n">length</span><span class="p">;</span> |
| <span class="p">...</span> |
| <span class="p">};</span> |
| </code></pre></div></div> |
| |
| <p>StructVectorBatch handles the struct columns and represents |
| the data as a vector of <code class="highlighter-rouge">ColumnVectorBatch</code> pointers, one per field.</p> |
| |
| <div class="language-cpp highlighter-rouge"><div class="highlight"><pre class="highlight"><code> <span class="k">struct</span> <span class="nc">StructVectorBatch</span><span class="o">:</span> <span class="k">public</span> <span class="n">ColumnVectorBatch</span> <span class="p">{</span> |
| <span class="n">std</span><span class="o">::</span><span class="n">vector</span><span class="o"><</span><span class="n">ColumnVectorBatch</span><span class="o">*></span> <span class="n">fields</span><span class="p">;</span> |
| <span class="p">...</span> |
| <span class="p">};</span> |
| </code></pre></div></div> |
| |
| <p>UnionVectorBatch handles the union columns. It uses <code class="highlighter-rouge">tags</code> |
| to indicate which subtype holds each value and <code class="highlighter-rouge">offsets</code> to indicate |
| the offset in the child batch of that subtype. An individual |
| <code class="highlighter-rouge">ColumnVectorBatch</code> is used for each subtype.</p> |
| |
| <div class="language-cpp highlighter-rouge"><div class="highlight"><pre class="highlight"><code> <span class="k">struct</span> <span class="nc">UnionVectorBatch</span><span class="o">:</span> <span class="k">public</span> <span class="n">ColumnVectorBatch</span> <span class="p">{</span> |
| <span class="n">DataBuffer</span><span class="o"><</span><span class="kt">unsigned</span> <span class="kt">char</span><span class="o">></span> <span class="n">tags</span><span class="p">;</span> |
| <span class="n">DataBuffer</span><span class="o"><</span><span class="kt">uint64_t</span><span class="o">></span> <span class="n">offsets</span><span class="p">;</span> |
| <span class="n">std</span><span class="o">::</span><span class="n">vector</span><span class="o"><</span><span class="n">ColumnVectorBatch</span><span class="o">*></span> <span class="n">children</span><span class="p">;</span> |
| <span class="p">...</span> |
| <span class="p">};</span> |
| </code></pre></div></div> |
| |
| <p>ListVectorBatch handles the array columns and represents |
| the data as a buffer of integers for the offsets and a |
| <code class="highlighter-rouge">ColumnVectorBatch</code> for the children values.</p> |
| |
| <div class="language-cpp highlighter-rouge"><div class="highlight"><pre class="highlight"><code> <span class="k">struct</span> <span class="nc">ListVectorBatch</span><span class="o">:</span> <span class="k">public</span> <span class="n">ColumnVectorBatch</span> <span class="p">{</span> |
| <span class="n">DataBuffer</span><span class="o"><</span><span class="kt">int64_t</span><span class="o">></span> <span class="n">offsets</span><span class="p">;</span> |
| <span class="n">std</span><span class="o">::</span><span class="n">unique_ptr</span><span class="o"><</span><span class="n">ColumnVectorBatch</span><span class="o">></span> <span class="n">elements</span><span class="p">;</span> |
| <span class="p">...</span> |
| <span class="p">};</span> |
| </code></pre></div></div> |
| |
| <p>MapVectorBatch handles the map columns and represents the data |
| as a buffer of integers for the offsets and two <code class="highlighter-rouge">ColumnVectorBatch</code>s |
| for the keys and values.</p> |
| |
| <div class="language-cpp highlighter-rouge"><div class="highlight"><pre class="highlight"><code> <span class="k">struct</span> <span class="nc">MapVectorBatch</span><span class="o">:</span> <span class="k">public</span> <span class="n">ColumnVectorBatch</span> <span class="p">{</span> |
| <span class="n">DataBuffer</span><span class="o"><</span><span class="kt">int64_t</span><span class="o">></span> <span class="n">offsets</span><span class="p">;</span> |
| <span class="n">std</span><span class="o">::</span><span class="n">unique_ptr</span><span class="o"><</span><span class="n">ColumnVectorBatch</span><span class="o">></span> <span class="n">keys</span><span class="p">;</span> |
| <span class="n">std</span><span class="o">::</span><span class="n">unique_ptr</span><span class="o"><</span><span class="n">ColumnVectorBatch</span><span class="o">></span> <span class="n">elements</span><span class="p">;</span> |
| <span class="p">...</span> |
| <span class="p">};</span> |
| </code></pre></div></div> |
| |
| <h2 id="writing-orc-files">Writing ORC Files</h2> |
| |
| <p>To write an ORC file, you need to include <code class="highlighter-rouge">OrcFile.hh</code> and define |
| the schema; then use <code class="highlighter-rouge">orc::OutputStream</code> and <code class="highlighter-rouge">orc::WriterOptions</code> |
| to create an <code class="highlighter-rouge">orc::Writer</code> with the desired filename. This example |
| sets the required schema parameter, but there are many other |
| options to control the ORC writer.</p> |
| |
| <div class="language-cpp highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="n">std</span><span class="o">::</span><span class="n">unique_ptr</span><span class="o"><</span><span class="n">OutputStream</span><span class="o">></span> <span class="n">outStream</span> <span class="o">=</span> |
| <span class="n">writeLocalFile</span><span class="p">(</span><span class="s">"my-file.orc"</span><span class="p">);</span> |
| <span class="n">std</span><span class="o">::</span><span class="n">unique_ptr</span><span class="o"><</span><span class="n">Type</span><span class="o">></span> <span class="n">schema</span><span class="p">(</span> |
| <span class="n">Type</span><span class="o">::</span><span class="n">buildTypeFromString</span><span class="p">(</span><span class="s">"struct<x:int,y:int>"</span><span class="p">));</span> |
| <span class="n">WriterOptions</span> <span class="n">options</span><span class="p">;</span> |
| <span class="n">std</span><span class="o">::</span><span class="n">unique_ptr</span><span class="o"><</span><span class="n">Writer</span><span class="o">></span> <span class="n">writer</span> <span class="o">=</span> |
| <span class="n">createWriter</span><span class="p">(</span><span class="o">*</span><span class="n">schema</span><span class="p">,</span> <span class="n">outStream</span><span class="p">.</span><span class="n">get</span><span class="p">(),</span> <span class="n">options</span><span class="p">);</span> |
| </code></pre></div></div> |
| |
| <p>Now you need to create a row batch, set the data, and write it to the file |
| as the batch fills up. When the file is done, close the <code class="highlighter-rouge">Writer</code>.</p> |
| |
| <div class="language-cpp highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="kt">uint64_t</span> <span class="n">batchSize</span> <span class="o">=</span> <span class="mi">1024</span><span class="p">,</span> <span class="n">rowCount</span> <span class="o">=</span> <span class="mi">10000</span><span class="p">;</span> |
| <span class="n">std</span><span class="o">::</span><span class="n">unique_ptr</span><span class="o"><</span><span class="n">ColumnVectorBatch</span><span class="o">></span> <span class="n">batch</span> <span class="o">=</span> |
| <span class="n">writer</span><span class="o">-></span><span class="n">createRowBatch</span><span class="p">(</span><span class="n">batchSize</span><span class="p">);</span> |
| <span class="n">StructVectorBatch</span> <span class="o">*</span><span class="n">root</span> <span class="o">=</span> |
| <span class="k">dynamic_cast</span><span class="o"><</span><span class="n">StructVectorBatch</span> <span class="o">*></span><span class="p">(</span><span class="n">batch</span><span class="p">.</span><span class="n">get</span><span class="p">());</span> |
| <span class="n">LongVectorBatch</span> <span class="o">*</span><span class="n">x</span> <span class="o">=</span> |
| <span class="k">dynamic_cast</span><span class="o"><</span><span class="n">LongVectorBatch</span> <span class="o">*></span><span class="p">(</span><span class="n">root</span><span class="o">-></span><span class="n">fields</span><span class="p">[</span><span class="mi">0</span><span class="p">]);</span> |
| <span class="n">LongVectorBatch</span> <span class="o">*</span><span class="n">y</span> <span class="o">=</span> |
| <span class="k">dynamic_cast</span><span class="o"><</span><span class="n">LongVectorBatch</span> <span class="o">*></span><span class="p">(</span><span class="n">root</span><span class="o">-></span><span class="n">fields</span><span class="p">[</span><span class="mi">1</span><span class="p">]);</span> |
| |
| <span class="kt">uint64_t</span> <span class="n">rows</span> <span class="o">=</span> <span class="mi">0</span><span class="p">;</span> |
| <span class="k">for</span> <span class="p">(</span><span class="kt">uint64_t</span> <span class="n">i</span> <span class="o">=</span> <span class="mi">0</span><span class="p">;</span> <span class="n">i</span> <span class="o"><</span> <span class="n">rowCount</span><span class="p">;</span> <span class="o">++</span><span class="n">i</span><span class="p">)</span> <span class="p">{</span> |
| <span class="n">x</span><span class="o">-></span><span class="n">data</span><span class="p">[</span><span class="n">rows</span><span class="p">]</span> <span class="o">=</span> <span class="n">i</span><span class="p">;</span> |
| <span class="n">y</span><span class="o">-></span><span class="n">data</span><span class="p">[</span><span class="n">rows</span><span class="p">]</span> <span class="o">=</span> <span class="n">i</span> <span class="o">*</span> <span class="mi">3</span><span class="p">;</span> |
| <span class="n">rows</span><span class="o">++</span><span class="p">;</span> |
| |
| <span class="k">if</span> <span class="p">(</span><span class="n">rows</span> <span class="o">==</span> <span class="n">batchSize</span><span class="p">)</span> <span class="p">{</span> |
| <span class="n">root</span><span class="o">-></span><span class="n">numElements</span> <span class="o">=</span> <span class="n">rows</span><span class="p">;</span> |
| <span class="n">x</span><span class="o">-></span><span class="n">numElements</span> <span class="o">=</span> <span class="n">rows</span><span class="p">;</span> |
| <span class="n">y</span><span class="o">-></span><span class="n">numElements</span> <span class="o">=</span> <span class="n">rows</span><span class="p">;</span> |
| |
| <span class="n">writer</span><span class="o">-></span><span class="n">add</span><span class="p">(</span><span class="o">*</span><span class="n">batch</span><span class="p">);</span> |
| <span class="n">rows</span> <span class="o">=</span> <span class="mi">0</span><span class="p">;</span> |
| <span class="p">}</span> |
| <span class="p">}</span> |
| |
| <span class="k">if</span> <span class="p">(</span><span class="n">rows</span> <span class="o">!=</span> <span class="mi">0</span><span class="p">)</span> <span class="p">{</span> |
| <span class="n">root</span><span class="o">-></span><span class="n">numElements</span> <span class="o">=</span> <span class="n">rows</span><span class="p">;</span> |
| <span class="n">x</span><span class="o">-></span><span class="n">numElements</span> <span class="o">=</span> <span class="n">rows</span><span class="p">;</span> |
| <span class="n">y</span><span class="o">-></span><span class="n">numElements</span> <span class="o">=</span> <span class="n">rows</span><span class="p">;</span> |
| |
| <span class="n">writer</span><span class="o">-></span><span class="n">add</span><span class="p">(</span><span class="o">*</span><span class="n">batch</span><span class="p">);</span> |
| <span class="n">rows</span> <span class="o">=</span> <span class="mi">0</span><span class="p">;</span> |
| <span class="p">}</span> |
| |
| <span class="n">writer</span><span class="o">-></span><span class="n">close</span><span class="p">();</span> |
| </code></pre></div></div> |
| |
| <h2 id="reading-orc-files">Reading ORC Files</h2> |
| |
| <p>To read ORC files, include the <code class="highlighter-rouge">OrcFile.hh</code> file and create an <code class="highlighter-rouge">orc::Reader</code> |
| that contains the metadata about the file. There are a few options to |
| the <code class="highlighter-rouge">orc::Reader</code>, but far fewer than the writer and none of them are |
| required. The reader has methods for getting the number of rows, |
| schema, compression, etc. from the file.</p> |
| |
| <div class="language-cpp highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="n">std</span><span class="o">::</span><span class="n">unique_ptr</span><span class="o"><</span><span class="n">InputStream</span><span class="o">></span> <span class="n">inStream</span> <span class="o">=</span> |
| <span class="n">readLocalFile</span><span class="p">(</span><span class="s">"my-file.orc"</span><span class="p">);</span> |
| <span class="n">ReaderOptions</span> <span class="n">options</span><span class="p">;</span> |
| <span class="n">std</span><span class="o">::</span><span class="n">unique_ptr</span><span class="o"><</span><span class="n">Reader</span><span class="o">></span> <span class="n">reader</span> <span class="o">=</span> |
| <span class="n">createReader</span><span class="p">(</span><span class="n">inStream</span><span class="p">,</span> <span class="n">options</span><span class="p">);</span> |
| </code></pre></div></div> |
| |
| <p>To get the data, create an <code class="highlighter-rouge">orc::RowReader</code> object. By default, |
| the RowReader reads all rows and all columns, but there are |
| options to control the data that is read.</p> |
| |
| <div class="language-cpp highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="n">RowReaderOptions</span> <span class="n">rowReaderOptions</span><span class="p">;</span> |
| <span class="n">std</span><span class="o">::</span><span class="n">unique_ptr</span><span class="o"><</span><span class="n">RowReader</span><span class="o">></span> <span class="n">rowReader</span> <span class="o">=</span> |
| <span class="n">reader</span><span class="o">-></span><span class="n">createRowReader</span><span class="p">(</span><span class="n">rowReaderOptions</span><span class="p">);</span> |
| <span class="n">std</span><span class="o">::</span><span class="n">unique_ptr</span><span class="o"><</span><span class="n">ColumnVectorBatch</span><span class="o">></span> <span class="n">batch</span> <span class="o">=</span> |
| <span class="n">rowReader</span><span class="o">-></span><span class="n">createRowBatch</span><span class="p">(</span><span class="mi">1024</span><span class="p">);</span> |
| </code></pre></div></div> |
| |
| <p>With an <code class="highlighter-rouge">orc::RowReader</code> the user can ask for the next batch until there |
| are no more left. The reader will stop the batch at certain boundaries, |
| so the returned batch may not be full, but it will always contain some rows.</p> |
| |
| <div class="language-cpp highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="k">while</span> <span class="p">(</span><span class="n">rowReader</span><span class="o">-></span><span class="n">next</span><span class="p">(</span><span class="o">*</span><span class="n">batch</span><span class="p">))</span> <span class="p">{</span> |
| <span class="k">for</span> <span class="p">(</span><span class="kt">uint64_t</span> <span class="n">r</span> <span class="o">=</span> <span class="mi">0</span><span class="p">;</span> <span class="n">r</span> <span class="o"><</span> <span class="n">batch</span><span class="o">-></span><span class="n">numElements</span><span class="p">;</span> <span class="o">++</span><span class="n">r</span><span class="p">)</span> <span class="p">{</span> |
| <span class="p">...</span> <span class="n">process</span> <span class="n">row</span> <span class="n">r</span> <span class="n">from</span> <span class="n">batch</span> |
| <span class="p">}</span> |
| <span class="p">}</span> |
| </code></pre></div></div> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| <div class="section-nav"> |
| <div class="left align-right"> |
| |
| |
| |
| <a href="/docs/core-java.html" class="prev">Back</a> |
| |
| </div> |
| <div class="right align-left"> |
| |
| |
| |
| <a href="/docs/core-java-config.html" class="next">Next</a> |
| |
| </div> |
| </div> |
| <div class="clear"></div> |
| |
| |
| </article> |
| </div> |
| |
| <div class="unit one-fifth hide-on-mobiles"> |
| <aside> |
| |
| <h4>Overview</h4> |
| |
| |
| <ul> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| <li class=""><a href="/docs/index.html">Background</a></li> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| <li class=""><a href="/docs/adopters.html">ORC Adopters</a></li> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| <li class=""><a href="/docs/types.html">Types</a></li> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| <li class=""><a href="/docs/indexes.html">Indexes</a></li> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| <li class=""><a href="/docs/acid.html">ACID support</a></li> |
| |
| |
| |
| </ul> |
| |
| |
| <h4>Installing</h4> |
| |
| |
| <ul> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| <li class=""><a href="/docs/building.html">Building ORC</a></li> |
| |
| |
| |
| </ul> |
| |
| |
| <h4>Using in Spark</h4> |
| |
| |
| <ul> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| <li class=""><a href="/docs/spark-ddl.html">Spark DDL</a></li> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| <li class=""><a href="/docs/spark-config.html">Spark Configuration</a></li> |
| |
| |
| |
| </ul> |
| |
| |
| <h4>Using in Python</h4> |
| |
| |
| <ul> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| <li class=""><a href="/docs/pyarrow.html">PyArrow</a></li> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| <li class=""><a href="/docs/dask.html">Dask</a></li> |
| |
| |
| |
| </ul> |
| |
| |
| <h4>Using in Hive</h4> |
| |
| |
| <ul> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| <li class=""><a href="/docs/hive-ddl.html">Hive DDL</a></li> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| <li class=""><a href="/docs/hive-config.html">Hive Configuration</a></li> |
| |
| |
| |
| </ul> |
| |
| |
| <h4>Using in MapReduce</h4> |
| |
| |
| <ul> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| <li class=""><a href="/docs/mapred.html">Using in MapRed</a></li> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| <li class=""><a href="/docs/mapreduce.html">Using in MapReduce</a></li> |
| |
| |
| |
| </ul> |
| |
| |
| <h4>Using ORC Core</h4> |
| |
| |
| <ul> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| <li class=""><a href="/docs/core-java.html">Using Core Java</a></li> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| <li class="current"><a href="/docs/core-cpp.html">Using Core C++</a></li> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| <li class=""><a href="/docs/core-java-config.html">ORC Java configuration</a></li> |
| |
| |
| |
| </ul> |
| |
| |
| <h4>Tools</h4> |
| |
| |
| <ul> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| <li class=""><a href="/docs/cpp-tools.html">C++ Tools</a></li> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| <li class=""><a href="/docs/java-tools.html">Java Tools</a></li> |
| |
| |
| |
| </ul> |
| |
| |
| </aside> |
| </div> |
| |
| |
| <div class="clear"></div> |
| |
| </div> |
| </section> |
| |
| |
| <footer role="contentinfo"> |
| <p style="margin-left: 20px; margin-right: 20px; text-align: center">The contents of this website are © 2024 |
| <a href="https://www.apache.org/">Apache Software Foundation</a> |
| under the terms of the <a |
| href="https://www.apache.org/licenses/LICENSE-2.0.html"> |
| Apache License v2</a>. Apache ORC and its logo are trademarks |
| of the Apache Software Foundation.</p> |
| </footer> |
| |
| <script> |
| var anchorForId = function (id) { |
| var anchor = document.createElement("a"); |
| anchor.className = "header-link"; |
| anchor.href = "#" + id; |
| anchor.innerHTML = "<span class=\"sr-only\">Permalink</span><i class=\"fa fa-link\"></i>"; |
| anchor.title = "Permalink"; |
| return anchor; |
| }; |
| |
| var linkifyAnchors = function (level, containingElement) { |
| var headers = containingElement.getElementsByTagName("h" + level); |
| for (var h = 0; h < headers.length; h++) { |
| var header = headers[h]; |
| |
| if (typeof header.id !== "undefined" && header.id !== "") { |
| header.appendChild(anchorForId(header.id)); |
| } |
| } |
| }; |
| |
| document.onreadystatechange = function () { |
| if (this.readyState === "complete") { |
| var contentBlock = document.getElementsByClassName("docs")[0] || document.getElementsByClassName("news")[0]; |
| if (!contentBlock) { |
| return; |
| } |
| for (var level = 1; level <= 6; level++) { |
| linkifyAnchors(level, contentBlock); |
| } |
| } |
| }; |
| </script> |
| |
| |
| </body> |
| </html> |