| <!DOCTYPE HTML> |
| <html lang="en-US"> |
| <head> |
| <meta charset="UTF-8"> |
| <title>Using in MapReduce</title> |
| <meta name="viewport" content="width=device-width,initial-scale=1"> |
| <meta name="generator" content="Jekyll v3.8.6"> |
| <link rel="stylesheet" href="//fonts.googleapis.com/css?family=Lato:300,300italic,400,400italic,700,700italic,900"> |
| <link rel="stylesheet" href="/css/screen.css"> |
| <link rel="icon" type="image/x-icon" href="/favicon.ico"> |
| <!--[if lt IE 9]> |
| <script src="/js/html5shiv.min.js"></script> |
| <script src="/js/respond.min.js"></script> |
| <![endif]--> |
| </head> |
| |
| |
| <body class="wrap"> |
| <header role="banner"> |
| <nav class="mobile-nav show-on-mobiles"> |
| <ul> |
| <li class=""> |
| <a href="/">Home</a> |
| </li> |
| <li class=""> |
| <a href="/releases/"><span class="show-on-mobiles">Rel</span> |
| <span class="hide-on-mobiles">Releases</span></a> |
| </li> |
| <li class="current"> |
| <a href="/docs/"><span class="show-on-mobiles">Doc</span> |
| <span class="hide-on-mobiles">Documentation</span></a> |
| </li> |
| <li class=""> |
| <a href="/talks/"><span class="show-on-mobiles">Talk</span> |
| <span class="hide-on-mobiles">Talks</span></a> |
| </li> |
| <li class=""> |
| <a href="/news/">News</a> |
| </li> |
| <li class=""> |
| <a href="/develop/"><span class="show-on-mobiles">Dev</span> |
| <span class="hide-on-mobiles">Develop</span></a> |
| </li> |
| <li class=""> |
| <a href="/help/">Help</a> |
| </li> |
| </ul> |
| |
| </nav> |
| <div class="grid"> |
| <div class="unit one-quarter center-on-mobiles"> |
| <h1> |
| <a href="/"> |
| <span class="sr-only">Apache ORC</span> |
| <img src="/img/logo.png" width="249" height="101" alt="ORC Logo"> |
| </a> |
| </h1> |
| </div> |
| <nav class="main-nav unit three-quarters hide-on-mobiles"> |
| <ul> |
| <li class=""> |
| <a href="/">Home</a> |
| </li> |
| <li class=""> |
| <a href="/releases/"><span class="show-on-mobiles">Rel</span> |
| <span class="hide-on-mobiles">Releases</span></a> |
| </li> |
| <li class="current"> |
| <a href="/docs/"><span class="show-on-mobiles">Doc</span> |
| <span class="hide-on-mobiles">Documentation</span></a> |
| </li> |
| <li class=""> |
| <a href="/talks/"><span class="show-on-mobiles">Talk</span> |
| <span class="hide-on-mobiles">Talks</span></a> |
| </li> |
| <li class=""> |
| <a href="/news/">News</a> |
| </li> |
| <li class=""> |
| <a href="/develop/"><span class="show-on-mobiles">Dev</span> |
| <span class="hide-on-mobiles">Develop</span></a> |
| </li> |
| <li class=""> |
| <a href="/help/">Help</a> |
| </li> |
| </ul> |
| |
| </nav> |
| </div> |
| </header> |
| |
| |
| <section class="docs"> |
| <div class="grid"> |
| |
| <div class="docs-nav-mobile unit whole show-on-mobiles"> |
| <select onchange="if (this.value) window.location.href=this.value"> |
| <option value="">Navigate the docs…</option> |
| |
| <optgroup label="Overview"> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| <option value="/docs/index.html">Background</option> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| <option value="/docs/adopters.html">ORC Adopters</option> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| <option value="/docs/types.html">Types</option> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| <option value="/docs/indexes.html">Indexes</option> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| <option value="/docs/acid.html">ACID support</option> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| </optgroup> |
| |
| <optgroup label="Installing"> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| <option value="/docs/building.html">Building ORC</option> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| </optgroup> |
| |
| <optgroup label="Using in Spark"> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| <option value="/docs/spark-ddl.html">Spark DDL</option> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| <option value="/docs/spark-config.html">Spark Configuration</option> |
| |
| |
| |
| |
| |
| |
| |
| |
| </optgroup> |
| |
| <optgroup label="Using in Python"> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| <option value="/docs/pyarrow.html">PyArrow</option> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| <option value="/docs/dask.html">Dask</option> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| </optgroup> |
| |
| <optgroup label="Using in Hive"> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| <option value="/docs/hive-ddl.html">Hive DDL</option> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| <option value="/docs/hive-config.html">Hive Configuration</option> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| </optgroup> |
| |
| <optgroup label="Using in MapReduce"> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| <option value="/docs/mapred.html">Using in MapRed</option> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| <option value="/docs/mapreduce.html">Using in MapReduce</option> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| </optgroup> |
| |
| <optgroup label="Using ORC Core"> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| <option value="/docs/core-java.html">Using Core Java</option> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| <option value="/docs/core-cpp.html">Using Core C++</option> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| <option value="/docs/core-java-config.html">ORC Java configuration</option> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| </optgroup> |
| |
| <optgroup label="Tools"> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| <option value="/docs/cpp-tools.html">C++ Tools</option> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| <option value="/docs/java-tools.html">Java Tools</option> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| </optgroup> |
| |
| </select> |
| </div> |
| |
| |
| <div class="unit four-fifths"> |
| <article> |
| <h1>Using in MapReduce</h1> |
| <p>This page describes how to read and write ORC files from Hadoop’s |
| newer org.apache.hadoop.mapreduce MapReduce APIs. If you want to use the |
| older org.apache.hadoop.mapred API, please look at the <a href="/docs/mapred.html">previous |
| page</a>.</p> |
| |
| <h2 id="reading-orc-files">Reading ORC files</h2> |
| |
| <p>Add ORC and your desired version of Hadoop to your <code class="highlighter-rouge">pom.xml</code>:</p> |
| |
| <div class="language-xml highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="nt">&lt;dependencies&gt;</span> |
| <span class="nt">&lt;dependency&gt;</span> |
| <span class="nt">&lt;groupId&gt;</span>org.apache.orc<span class="nt">&lt;/groupId&gt;</span> |
| <span class="nt">&lt;artifactId&gt;</span>orc-mapreduce<span class="nt">&lt;/artifactId&gt;</span> |
| <span class="nt">&lt;version&gt;</span>1.1.0<span class="nt">&lt;/version&gt;</span> |
| <span class="nt">&lt;/dependency&gt;</span> |
| <span class="nt">&lt;dependency&gt;</span> |
| <span class="nt">&lt;groupId&gt;</span>org.apache.hadoop<span class="nt">&lt;/groupId&gt;</span> |
| <span class="nt">&lt;artifactId&gt;</span>hadoop-mapreduce-client-core<span class="nt">&lt;/artifactId&gt;</span> |
| <span class="nt">&lt;version&gt;</span>2.7.0<span class="nt">&lt;/version&gt;</span> |
| <span class="nt">&lt;/dependency&gt;</span> |
| <span class="nt">&lt;/dependencies&gt;</span> |
| </code></pre></div></div> |
| |
| <p>Set the minimal properties in your JobConf:</p> |
| |
| <ul> |
| <li><strong>mapreduce.job.inputformat.class</strong> = <a href="/api/orc-mapreduce/index.html?org/apache/orc/mapreduce/OrcInputFormat.html">org.apache.orc.mapreduce.OrcInputFormat</a></li> |
| <li><strong>mapreduce.input.fileinputformat.inputdir</strong> = your input directory</li> |
| </ul> |
| |
| <p>ORC files contain a series of values of the same type and that type |
| schema is encoded in the file. Because the ORC files are |
| self-describing, the reader always knows how to correctly interpret |
| the data. All of the ORC files written by Hive and most of the others have |
| a struct as the value type.</p> |
| |
| <p>Your Mapper class will receive org.apache.hadoop.io.NullWritable as |
| the key and a value based on the table below expanded recursively.</p> |
| |
| <table> |
| <thead> |
| <tr> |
| <th>ORC Type</th> |
| <th>Writable Type</th> |
| </tr> |
| </thead> |
| <tbody> |
| <tr> |
| <td>array</td> |
| <td><a href="/api/orc-mapreduce/index.html?org/apache/orc/mapred/OrcList.html">org.apache.orc.mapred.OrcList</a></td> |
| </tr> |
| <tr> |
| <td>binary</td> |
| <td>org.apache.hadoop.io.BytesWritable</td> |
| </tr> |
| <tr> |
| <td>bigint</td> |
| <td>org.apache.hadoop.io.LongWritable</td> |
| </tr> |
| <tr> |
| <td>boolean</td> |
| <td>org.apache.hadoop.io.BooleanWritable</td> |
| </tr> |
| <tr> |
| <td>char</td> |
| <td>org.apache.hadoop.io.Text</td> |
| </tr> |
| <tr> |
| <td>date</td> |
| <td><a href="https://javadoc.io/static/org.apache.hive/hive-storage-api/2.8.1/org/apache/hadoop/hive/serde2/io/DateWritable.html">org.apache.hadoop.hive.serde2.io.DateWritable</a></td> |
| </tr> |
| <tr> |
| <td>decimal</td> |
| <td><a href="https://javadoc.io/static/org.apache.hive/hive-storage-api/2.8.1/org/apache/hadoop/hive/serde2/io/HiveDecimalWritable.html">org.apache.hadoop.hive.serde2.io.HiveDecimalWritable</a></td> |
| </tr> |
| <tr> |
| <td>double</td> |
| <td>org.apache.hadoop.io.DoubleWritable</td> |
| </tr> |
| <tr> |
| <td>float</td> |
| <td>org.apache.hadoop.io.FloatWritable</td> |
| </tr> |
| <tr> |
| <td>int</td> |
| <td>org.apache.hadoop.io.IntWritable</td> |
| </tr> |
| <tr> |
| <td>map</td> |
| <td><a href="/api/orc-mapreduce/index.html?org/apache/orc/mapred/OrcMap.html">org.apache.orc.mapred.OrcMap</a></td> |
| </tr> |
| <tr> |
| <td>smallint</td> |
| <td>org.apache.hadoop.io.ShortWritable</td> |
| </tr> |
| <tr> |
| <td>string</td> |
| <td>org.apache.hadoop.io.Text</td> |
| </tr> |
| <tr> |
| <td>struct</td> |
| <td><a href="/api/orc-mapreduce/index.html?org/apache/orc/mapred/OrcStruct.html">org.apache.orc.mapred.OrcStruct</a></td> |
| </tr> |
| <tr> |
| <td>timestamp</td> |
| <td><a href="/api/orc-mapreduce/index.html?org/apache/orc/mapred/OrcTimestamp.html">org.apache.orc.mapred.OrcTimestamp</a></td> |
| </tr> |
| <tr> |
| <td>tinyint</td> |
| <td>org.apache.hadoop.io.ByteWritable</td> |
| </tr> |
| <tr> |
| <td>uniontype</td> |
| <td><a href="/api/orc-mapreduce/index.html?org/apache/orc/mapred/OrcUnion.html">org.apache.orc.mapred.OrcUnion</a></td> |
| </tr> |
| <tr> |
| <td>varchar</td> |
| <td>org.apache.hadoop.io.Text</td> |
| </tr> |
| </tbody> |
| </table> |
| |
| <p>Let’s assume that your input directory contains ORC files with the |
| schema <code class="highlighter-rouge">struct&lt;s:string,i:int&gt;</code> and you want to use the string field |
| as the key to the MapReduce shuffle and the integer as the value. The |
| mapper code would look like:</p> |
| |
| <div class="language-java highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="kd">public</span> <span class="kd">static</span> <span class="kd">class</span> <span class="nc">MyMapper</span> |
| <span class="kd">extends</span> <span class="nc">Mapper</span><span class="o">&lt;</span><span class="nc">NullWritable</span><span class="o">,</span><span class="nc">OrcStruct</span><span class="o">,</span><span class="nc">Text</span><span class="o">,</span><span class="nc">IntWritable</span><span class="o">&gt;</span> <span class="o">{</span> |
| |
| <span class="c1">// Assume the ORC file has type: struct&lt;s:string,i:int&gt;</span> |
| <span class="kd">public</span> <span class="kt">void</span> <span class="nf">map</span><span class="o">(</span><span class="nc">NullWritable</span> <span class="n">key</span><span class="o">,</span> <span class="nc">OrcStruct</span> <span class="n">value</span><span class="o">,</span> |
| <span class="nc">Context</span> <span class="n">output</span><span class="o">)</span> <span class="kd">throws</span> <span class="nc">IOException</span><span class="o">,</span> <span class="nc">InterruptedException</span> <span class="o">{</span> |
| <span class="c1">// take the first field as the key and the second field as the value</span> |
| <span class="n">output</span><span class="o">.</span><span class="na">write</span><span class="o">((</span><span class="nc">Text</span><span class="o">)</span> <span class="n">value</span><span class="o">.</span><span class="na">getFieldValue</span><span class="o">(</span><span class="mi">0</span><span class="o">),</span> |
| <span class="o">(</span><span class="nc">IntWritable</span><span class="o">)</span> <span class="n">value</span><span class="o">.</span><span class="na">getFieldValue</span><span class="o">(</span><span class="mi">1</span><span class="o">));</span> |
| <span class="o">}</span> |
| <span class="o">}</span> |
| </code></pre></div></div> |
| |
| <h2 id="writing-orc-files">Writing ORC files</h2> |
| |
| <p>To write ORC files from your MapReduce job, you’ll need to set</p> |
| |
| <ul> |
| <li><strong>mapreduce.job.outputformat.class</strong> = <a href="/api/orc-mapreduce/index.html?org/apache/orc/mapreduce/OrcOutputFormat.html">org.apache.orc.mapreduce.OrcOutputFormat</a></li> |
| <li><strong>mapreduce.output.fileoutputformat.outputdir</strong> = your output directory</li> |
| <li><strong>orc.mapred.output.schema</strong> = the schema to write to the ORC file</li> |
| </ul> |
| |
| <p>The reducer needs to create the Writable value to be put into the ORC |
| file and typically uses the OrcStruct.createValue(TypeDescription) |
| function. For our example, let’s assume that the shuffle types are |
| (Text, IntWritable) from the previous section and the reduce should |
| gather the integers for each key together and write them as a list. The |
| output schema would be <code class="highlighter-rouge">struct&lt;key:string,ints:array&lt;int&gt;&gt;</code>. As always |
| with MapReduce, if your method stores the values, you need to copy their |
| value before getting the next.</p> |
| |
| <div class="language-java highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="kd">public</span> <span class="kd">static</span> <span class="kd">class</span> <span class="nc">MyReducer</span> |
| <span class="kd">extends</span> <span class="nc">Reducer</span><span class="o">&lt;</span><span class="nc">Text</span><span class="o">,</span><span class="nc">IntWritable</span><span class="o">,</span><span class="nc">NullWritable</span><span class="o">,</span><span class="nc">OrcStruct</span><span class="o">&gt;</span> <span class="o">{</span> |
| |
| <span class="kd">private</span> <span class="nc">TypeDescription</span> <span class="n">schema</span> <span class="o">=</span> |
| <span class="nc">TypeDescription</span><span class="o">.</span><span class="na">fromString</span><span class="o">(</span><span class="s">"struct&lt;key:string,ints:array&lt;int&gt;&gt;"</span><span class="o">);</span> |
| <span class="c1">// createValue creates the correct value type for the schema</span> |
| <span class="kd">private</span> <span class="nc">OrcStruct</span> <span class="n">pair</span> <span class="o">=</span> <span class="o">(</span><span class="nc">OrcStruct</span><span class="o">)</span> <span class="nc">OrcStruct</span><span class="o">.</span><span class="na">createValue</span><span class="o">(</span><span class="n">schema</span><span class="o">);</span> |
| <span class="c1">// get a handle to the list of ints</span> |
| <span class="kd">private</span> <span class="nc">OrcList</span><span class="o">&lt;</span><span class="nc">IntWritable</span><span class="o">&gt;</span> <span class="n">valueList</span> <span class="o">=</span> |
| <span class="o">(</span><span class="nc">OrcList</span><span class="o">&lt;</span><span class="nc">IntWritable</span><span class="o">&gt;)</span> <span class="n">pair</span><span class="o">.</span><span class="na">getFieldValue</span><span class="o">(</span><span class="mi">1</span><span class="o">);</span> |
| <span class="kd">private</span> <span class="kd">final</span> <span class="nc">NullWritable</span> <span class="n">nada</span> <span class="o">=</span> <span class="nc">NullWritable</span><span class="o">.</span><span class="na">get</span><span class="o">();</span> |
| |
| <span class="kd">public</span> <span class="kt">void</span> <span class="nf">reduce</span><span class="o">(</span><span class="nc">Text</span> <span class="n">key</span><span class="o">,</span> <span class="nc">Iterable</span><span class="o">&lt;</span><span class="nc">IntWritable</span><span class="o">&gt;</span> <span class="n">values</span><span class="o">,</span> |
| <span class="nc">Context</span> <span class="n">output</span> |
| <span class="o">)</span> <span class="kd">throws</span> <span class="nc">IOException</span><span class="o">,</span> <span class="nc">InterruptedException</span> <span class="o">{</span> |
| <span class="n">pair</span><span class="o">.</span><span class="na">setFieldValue</span><span class="o">(</span><span class="mi">0</span><span class="o">,</span> <span class="n">key</span><span class="o">);</span> |
| <span class="n">valueList</span><span class="o">.</span><span class="na">clear</span><span class="o">();</span> |
| <span class="k">for</span><span class="o">(</span><span class="nc">IntWritable</span> <span class="nl">val:</span> <span class="n">values</span><span class="o">)</span> <span class="o">{</span> |
| <span class="n">valueList</span><span class="o">.</span><span class="na">add</span><span class="o">(</span><span class="k">new</span> <span class="nc">IntWritable</span><span class="o">(</span><span class="n">val</span><span class="o">.</span><span class="na">get</span><span class="o">()));</span> |
| <span class="o">}</span> |
| <span class="n">output</span><span class="o">.</span><span class="na">write</span><span class="o">(</span><span class="n">nada</span><span class="o">,</span> <span class="n">pair</span><span class="o">);</span> |
| <span class="o">}</span> |
| <span class="o">}</span> |
| </code></pre></div></div> |
| |
| <h2 id="sending-orcstruct-orclist-orcmap-or-orcunion-through-the-shuffle">Sending OrcStruct, OrcList, OrcMap, or OrcUnion through the Shuffle</h2> |
| |
| <p>In the previous examples, only the Hadoop types were sent through the |
| MapReduce shuffle. The complex ORC types, since they are generic |
| types, need to have their full type information provided to create the |
| object. To enable MapReduce to properly instantiate the OrcStruct and |
| other ORC types, we need to wrap it in either an |
| <a href="/api/orc-mapreduce/index.html?org/apache/orc/mapred/OrcKey.html">OrcKey</a> |
| for the shuffle key or |
| <a href="/api/orc-mapreduce/index.html?org/apache/orc/mapred/OrcValue.html">OrcValue</a> |
| for the shuffle value.</p> |
| |
| <p>To send two OrcStructs through the shuffle, define the following properties |
| in the JobConf:</p> |
| |
| <ul> |
| <li><strong>mapreduce.map.output.key.class</strong> = org.apache.orc.mapred.OrcKey</li> |
| <li><strong>orc.mapred.map.output.key.schema</strong> = the shuffle key’s schema</li> |
| <li><strong>mapreduce.map.output.value.class</strong> = org.apache.orc.mapred.OrcValue</li> |
| <li><strong>orc.mapred.map.output.value.schema</strong> = the shuffle value’s schema</li> |
| </ul> |
| |
| <p>The mapper just adds an OrcKey and OrcValue around the key and value |
| respectively. These objects should be created once and reused as the mapper |
| runs.</p> |
| |
| <div class="language-java highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="kd">public</span> <span class="kd">static</span> <span class="kd">class</span> <span class="nc">MyMapperShuffle</span> |
| <span class="kd">extends</span> <span class="nc">Mapper</span><span class="o">&lt;</span><span class="nc">NullWritable</span><span class="o">,</span><span class="nc">OrcStruct</span><span class="o">,</span><span class="nc">OrcKey</span><span class="o">,</span><span class="nc">OrcValue</span><span class="o">&gt;</span> <span class="o">{</span> |
| <span class="kd">private</span> <span class="nc">OrcKey</span> <span class="n">keyWrapper</span> <span class="o">=</span> <span class="k">new</span> <span class="nc">OrcKey</span><span class="o">();</span> |
| <span class="kd">private</span> <span class="nc">OrcValue</span> <span class="n">valueWrapper</span> <span class="o">=</span> <span class="k">new</span> <span class="nc">OrcValue</span><span class="o">();</span> |
| <span class="kd">private</span> <span class="nc">OrcStruct</span> <span class="n">outStruct</span> <span class="o">=</span> <span class="o">(</span><span class="nc">OrcStruct</span><span class="o">)</span> <span class="nc">OrcStruct</span><span class="o">.</span><span class="na">createValue</span> |
| <span class="o">(</span><span class="nc">TypeDescription</span><span class="o">.</span><span class="na">fromString</span><span class="o">(</span><span class="s">"struct&lt;i:int,j:int&gt;"</span><span class="o">));</span> |
| <span class="kd">private</span> <span class="nc">IntWritable</span> <span class="n">i</span> <span class="o">=</span> <span class="o">(</span><span class="nc">IntWritable</span><span class="o">)</span> <span class="n">outStruct</span><span class="o">.</span><span class="na">getFieldValue</span><span class="o">(</span><span class="s">"i"</span><span class="o">);</span> |
| <span class="kd">private</span> <span class="nc">IntWritable</span> <span class="n">j</span> <span class="o">=</span> <span class="o">(</span><span class="nc">IntWritable</span><span class="o">)</span> <span class="n">outStruct</span><span class="o">.</span><span class="na">getFieldValue</span><span class="o">(</span><span class="s">"j"</span><span class="o">);</span> |
| |
| <span class="c1">// Assume the input has type: struct&lt;s:string,i:int&gt;</span> |
| <span class="kd">public</span> <span class="kt">void</span> <span class="nf">map</span><span class="o">(</span><span class="nc">NullWritable</span> <span class="n">key</span><span class="o">,</span> <span class="nc">OrcStruct</span> <span class="n">value</span><span class="o">,</span> |
| <span class="nc">Context</span> <span class="n">output</span><span class="o">)</span> <span class="kd">throws</span> <span class="nc">IOException</span><span class="o">,</span> <span class="nc">InterruptedException</span> <span class="o">{</span> |
| <span class="n">keyWrapper</span><span class="o">.</span><span class="na">key</span> <span class="o">=</span> <span class="n">value</span><span class="o">;</span> |
| <span class="n">valueWrapper</span><span class="o">.</span><span class="na">value</span> <span class="o">=</span> <span class="n">outStruct</span><span class="o">;</span> |
| <span class="kt">int</span> <span class="n">val</span> <span class="o">=</span> <span class="o">((</span><span class="nc">IntWritable</span><span class="o">)</span> <span class="n">value</span><span class="o">.</span><span class="na">getFieldValue</span><span class="o">(</span><span class="s">"i"</span><span class="o">)).</span><span class="na">get</span><span class="o">();</span> |
| <span class="n">i</span><span class="o">.</span><span class="na">set</span><span class="o">(</span><span class="n">val</span> <span class="o">*</span> <span class="mi">2</span><span class="o">);</span> |
| <span class="n">j</span><span class="o">.</span><span class="na">set</span><span class="o">(</span><span class="n">val</span> <span class="o">*</span> <span class="n">val</span><span class="o">);</span> |
| <span class="n">output</span><span class="o">.</span><span class="na">write</span><span class="o">(</span><span class="n">keyWrapper</span><span class="o">,</span> <span class="n">valueWrapper</span><span class="o">);</span> |
| <span class="o">}</span> |
| <span class="o">}</span> |
| </code></pre></div></div> |
| |
| <p>The reducer code accesses the underlying OrcStructs by using the |
| OrcKey.key and OrcValue.value fields.</p> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| <div class="section-nav"> |
| <div class="left align-right"> |
| |
| |
| |
| <a href="/docs/mapred.html" class="prev">Back</a> |
| |
| </div> |
| <div class="right align-left"> |
| |
| |
| |
| <a href="/docs/core-java.html" class="next">Next</a> |
| |
| </div> |
| </div> |
| <div class="clear"></div> |
| |
| |
| </article> |
| </div> |
| |
| <div class="unit one-fifth hide-on-mobiles"> |
| <aside> |
| |
| <h4>Overview</h4> |
| |
| |
| <ul> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| <li class=""><a href="/docs/index.html">Background</a></li> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| <li class=""><a href="/docs/adopters.html">ORC Adopters</a></li> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| <li class=""><a href="/docs/types.html">Types</a></li> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| <li class=""><a href="/docs/indexes.html">Indexes</a></li> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| <li class=""><a href="/docs/acid.html">ACID support</a></li> |
| |
| |
| |
| </ul> |
| |
| |
| <h4>Installing</h4> |
| |
| |
| <ul> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| <li class=""><a href="/docs/building.html">Building ORC</a></li> |
| |
| |
| |
| </ul> |
| |
| |
| <h4>Using in Spark</h4> |
| |
| |
| <ul> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| <li class=""><a href="/docs/spark-ddl.html">Spark DDL</a></li> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| <li class=""><a href="/docs/spark-config.html">Spark Configuration</a></li> |
| |
| |
| |
| </ul> |
| |
| |
| <h4>Using in Python</h4> |
| |
| |
| <ul> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| <li class=""><a href="/docs/pyarrow.html">PyArrow</a></li> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| <li class=""><a href="/docs/dask.html">Dask</a></li> |
| |
| |
| |
| </ul> |
| |
| |
| <h4>Using in Hive</h4> |
| |
| |
| <ul> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| <li class=""><a href="/docs/hive-ddl.html">Hive DDL</a></li> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| <li class=""><a href="/docs/hive-config.html">Hive Configuration</a></li> |
| |
| |
| |
| </ul> |
| |
| |
| <h4>Using in MapReduce</h4> |
| |
| |
| <ul> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| <li class=""><a href="/docs/mapred.html">Using in MapRed</a></li> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| <li class="current"><a href="/docs/mapreduce.html">Using in MapReduce</a></li> |
| |
| |
| |
| </ul> |
| |
| |
| <h4>Using ORC Core</h4> |
| |
| |
| <ul> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| <li class=""><a href="/docs/core-java.html">Using Core Java</a></li> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| <li class=""><a href="/docs/core-cpp.html">Using Core C++</a></li> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| <li class=""><a href="/docs/core-java-config.html">ORC Java configuration</a></li> |
| |
| |
| |
| </ul> |
| |
| |
| <h4>Tools</h4> |
| |
| |
| <ul> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| <li class=""><a href="/docs/cpp-tools.html">C++ Tools</a></li> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| <li class=""><a href="/docs/java-tools.html">Java Tools</a></li> |
| |
| |
| |
| </ul> |
| |
| |
| </aside> |
| </div> |
| |
| |
| <div class="clear"></div> |
| |
| </div> |
| </section> |
| |
| |
| <footer role="contentinfo"> |
| <p style="margin-left: 20px; margin-right: 20px; text-align: center">The contents of this website are © 2024 |
| <a href="https://www.apache.org/">Apache Software Foundation</a> |
| under the terms of the <a |
| href="https://www.apache.org/licenses/LICENSE-2.0.html"> |
| Apache License v2</a>. Apache ORC and its logo are trademarks |
| of the Apache Software Foundation.</p> |
| </footer> |
| |
| <script> |
/**
 * Build the permalink anchor that is appended to a heading.
 *
 * @param {string} id - The heading's id; used verbatim as the URL fragment.
 * @returns {HTMLAnchorElement} A detached <a class="header-link"> element
 *     whose href is "#" + id.
 */
var anchorForId = function (id) {
  var anchor = document.createElement("a");
  anchor.className = "header-link";
  anchor.href = "#" + id;
  // The "sr-only" span carries the link text. The icon element is purely
  // decorative, so it is marked aria-hidden to keep assistive technology
  // from announcing it in addition to "Permalink".
  anchor.innerHTML = "<span class=\"sr-only\">Permalink</span><i class=\"fa fa-link\" aria-hidden=\"true\"></i>";
  anchor.title = "Permalink";
  return anchor;
};
| |
/**
 * Append a permalink anchor to every heading of the given level that has a
 * non-empty id.
 *
 * @param {number} level - Heading level; selects the <h{level}> elements.
 * @param {Element} containingElement - Element whose descendant headings
 *     are processed.
 */
var linkifyAnchors = function (level, containingElement) {
  var tagName = "h" + level;
  var headings = containingElement.getElementsByTagName(tagName);
  var index = 0;
  while (index < headings.length) {
    var heading = headings[index];
    index += 1;

    // Headings without a usable id have nothing to link to; skip them.
    var headingId = heading.id;
    if (typeof headingId === "undefined" || headingId === "") {
      continue;
    }
    heading.appendChild(anchorForId(headingId));
  }
};
| |
// Once the document has finished loading, add permalink anchors to every
// id-bearing heading (h1 through h6) inside the first "docs" element or,
// failing that, the first "news" element.
(function () {
  var linkifyContentHeadings = function () {
    var contentBlock = document.getElementsByClassName("docs")[0] || document.getElementsByClassName("news")[0];
    if (!contentBlock) {
      return;
    }
    for (var level = 1; level <= 6; level++) {
      linkifyAnchors(level, contentBlock);
    }
  };

  if (document.readyState === "complete") {
    // The document is already loaded, so no further transition to
    // "complete" will be reported; run right away.
    linkifyContentHeadings();
    return;
  }

  // addEventListener leaves any other readystatechange handler in place,
  // whereas assigning document.onreadystatechange would replace it.
  document.addEventListener("readystatechange", function () {
    if (document.readyState === "complete") {
      linkifyContentHeadings();
    }
  });
})();
| </script> |
| |
| |
| </body> |
| </html> |