blob: df7a621716cbe994ff08bc426d7ef485afd11575 [file] [log] [blame]
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.orc;
import org.apache.orc.impl.InStream;
import org.apache.orc.impl.KeyProvider;
import org.apache.orc.impl.MemoryManagerImpl;
import org.apache.orc.impl.OrcCodecPool;
import org.apache.orc.impl.WriterImpl;
import org.apache.orc.OrcFile.WriterOptions;
import org.apache.orc.impl.ReaderImpl;
import org.apache.orc.impl.reader.ReaderEncryption;
import org.apache.orc.impl.reader.StripePlanner;
import org.junit.Assert;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.common.type.HiveDecimal;
import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.Decimal64ColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.ListColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.MapColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.StructColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.UnionColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.orc.impl.DataReaderProperties;
import org.apache.orc.impl.OrcIndex;
import org.apache.orc.impl.RecordReaderImpl;
import org.apache.orc.impl.RecordReaderUtils;
import org.junit.Assume;
import org.junit.Before;
import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.TestName;
import org.junit.runner.RunWith;
import org.junit.runners.Parameterized;
import org.mockito.Mockito;
import javax.xml.bind.DatatypeConverter;
import java.math.BigInteger;
import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
import java.sql.Date;
import java.sql.Timestamp;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.TimeZone;
import java.util.function.IntFunction;
import static junit.framework.TestCase.assertNotNull;
import static org.junit.Assert.*;
/**
 * Tests for the vectorized reader and writer for ORC files.
 */
public class TestVectorOrcFile {
// ORC file-format version this test instance runs against. It is read when
// building the per-test file name and by version-specific assertions.
public OrcFile.Version fileFormat;
/**
 * Builds the parameter sets for this test class: one single-element
 * {@code Object[]} per {@link OrcFile.Version} listed in {@code params}.
 *
 * NOTE(review): the array initializer below appears truncated in this copy
 * (no elements and no closing brace are visible), and no parameter-source
 * annotation is visible on the method -- confirm against the upstream file.
 *
 * @return a list holding one {@code Object[]{version}} entry per version
 */
public static Collection<Object[]> getParameters() {
OrcFile.Version[] params = new OrcFile.Version[]{
List<Object[]> result = new ArrayList<>();
// Wrap each version in its own one-element argument array.
for(OrcFile.Version v: params) {
result.add(new Object[]{v});
return result;
public static String getFileFromClasspath(String name) {
URL url = ClassLoader.getSystemResource(name);
if (url == null) {
throw new IllegalArgumentException("Could not find " + name);
return url.getPath();
public static class InnerStruct {
int int1;
Text string1 = new Text();
InnerStruct(int int1, Text string1) {
this.int1 = int1;
InnerStruct(int int1, String string1) {
this.int1 = int1;
public String toString() {
return "{" + int1 + ", " + string1 + "}";
public static class MiddleStruct {
List<InnerStruct> list = new ArrayList<InnerStruct>();
MiddleStruct(InnerStruct... items) {
private static InnerStruct inner(int i, String s) {
return new InnerStruct(i, s);
private static Map<String, InnerStruct> map(InnerStruct... items) {
Map<String, InnerStruct> result = new HashMap<String, InnerStruct>();
for(InnerStruct i: items) {
result.put(i.string1.toString(), i);
return result;
private static List<InnerStruct> list(InnerStruct... items) {
List<InnerStruct> result = new ArrayList<InnerStruct>();
return result;
private static BytesWritable bytes(int... items) {
BytesWritable result = new BytesWritable();
for(int i=0; i < items.length; ++i) {
result.getBytes()[i] = (byte) items[i];
return result;
private static byte[] bytesArray(int... items) {
byte[] result = new byte[items.length];
for(int i=0; i < items.length; ++i) {
result[i] = (byte) items[i];
return result;
private static ByteBuffer byteBuf(int... items) {
ByteBuffer result = ByteBuffer.allocate(items.length);
for(int item: items) {
result.put((byte) item);
return result;
// Directory for files produced by the tests: the "test.tmp.dir" system
// property when set, otherwise the relative path target/test/tmp (joined
// with the platform separator).
Path workDir = new Path(System.getProperty("test.tmp.dir",
"target" + File.separator + "test" + File.separator + "tmp"));
// Hadoop configuration, local file system and per-test output file; all
// three are assigned in openFileSystem().
Configuration conf;
FileSystem fs;
Path testFilePath;
// Supplies the running test method's name, which openFileSystem() uses to
// build testFilePath.
// NOTE(review): no rule annotation is visible on this field in this copy --
// confirm it is registered with JUnit upstream.
public TestName testCaseName = new TestName();
public void openFileSystem () throws Exception {
conf = new Configuration();
fs = FileSystem.getLocal(conf);
testFilePath = new Path(workDir, "TestVectorOrcFile." +
testCaseName.getMethodName().replaceFirst("\\[[0-9]+\\]", "")
+ "." + fileFormat.getName() + ".orc");
fs.delete(testFilePath, false);
/**
 * Reads the classpath resource {@code orc-file-11-format.orc} and verifies
 * its stripe layout, file-level column statistics, schema, and the first
 * row of each of two successive batches. The test is skipped (via
 * {@code Assume}) unless {@code fileFormat} is {@code V_0_11}.
 *
 * NOTE(review): several statements in this copy end in a dangling comma or
 * begin mid-expression (the reader creation, some statistics assertions,
 * the schema string, the timestamp/decimal assertions). The missing parts
 * cannot be recovered from this view -- confirm against the upstream file.
 * NOTE(review): no test annotation is visible on this method in this copy.
 */
public void testReadFormat_0_11() throws Exception {
Assume.assumeTrue(fileFormat == OrcFile.Version.V_0_11);
Path oldFilePath =
new Path(getFileFromClasspath("orc-file-11-format.orc"));
Reader reader = OrcFile.createReader(oldFilePath,
int stripeCount = 0;
int rowCount = 0;
long currentOffset = -1;
// Walk the stripes: the first one seeds the running end offset, and every
// later stripe must start exactly where the previous one ended.
for(StripeInformation stripe : reader.getStripes()) {
stripeCount += 1;
rowCount += stripe.getNumberOfRows();
if (currentOffset < 0) {
currentOffset = stripe.getOffset() + stripe.getIndexLength()
+ stripe.getDataLength() + stripe.getFooterLength();
} else {
assertEquals(currentOffset, stripe.getOffset());
currentOffset += stripe.getIndexLength() + stripe.getDataLength()
+ stripe.getFooterLength();
// The per-stripe row counts must add up to the file's row count.
Assert.assertEquals(reader.getNumberOfRows(), rowCount);
assertEquals(2, stripeCount);
// check the stats
ColumnStatistics[] stats = reader.getStatistics();
assertEquals(7500, stats[1].getNumberOfValues());
assertEquals(3750, ((BooleanColumnStatistics) stats[1]).getFalseCount());
assertEquals(3750, ((BooleanColumnStatistics) stats[1]).getTrueCount());
assertEquals("count: 7500 hasNull: true true: 3750", stats[1].toString());
assertEquals(2048, ((IntegerColumnStatistics) stats[3]).getMaximum());
assertEquals(1024, ((IntegerColumnStatistics) stats[3]).getMinimum());
assertEquals(true, ((IntegerColumnStatistics) stats[3]).isSumDefined());
assertEquals(11520000, ((IntegerColumnStatistics) stats[3]).getSum());
assertEquals("count: 7500 hasNull: true min: 1024 max: 2048 sum: 11520000",
((IntegerColumnStatistics) stats[5]).getMaximum());
((IntegerColumnStatistics) stats[5]).getMinimum());
assertEquals(false, ((IntegerColumnStatistics) stats[5]).isSumDefined());
"count: 7500 hasNull: true min: 9223372036854775807 max: 9223372036854775807",
assertEquals(-15.0, ((DoubleColumnStatistics) stats[7]).getMinimum(), 0.0001);
assertEquals(-5.0, ((DoubleColumnStatistics) stats[7]).getMaximum(), 0.0001);
assertEquals(-75000.0, ((DoubleColumnStatistics) stats[7]).getSum(),
assertEquals("count: 7500 hasNull: true min: -15.0 max: -5.0 sum: -75000.0",
assertEquals("count: 7500 hasNull: true min: bye max: hi sum: 0", stats[9].toString());
// check the inspectors
TypeDescription schema = reader.getSchema();
assertEquals(TypeDescription.Category.STRUCT, schema.getCategory());
+ "int1:int,long1:bigint,float1:float,double1:double,bytes1:"
+ "binary,string1:string,middle:struct<list:array<struct<int1:int,"
+ "string1:string>>>,list:array<struct<int1:int,string1:string>>,"
+ "map:map<string,struct<int1:int,string1:string>>,ts:timestamp,"
+ "decimal1:decimal(38,10)>", schema.toString());
VectorizedRowBatch batch = schema.createRowBatch();
RecordReader rows = reader.rows();
Assert.assertEquals(true, rows.nextBatch(batch));
assertEquals(1024, batch.size);
// check the contents of the first row
assertEquals(false, getBoolean(batch, 0));
assertEquals(1, getByte(batch, 0));
assertEquals(1024, getShort(batch, 0));
assertEquals(65536, getInt(batch, 0));
assertEquals(Long.MAX_VALUE, getLong(batch, 0));
assertEquals(1.0, getFloat(batch, 0), 0.00001);
assertEquals(-15.0, getDouble(batch, 0), 0.00001);
assertEquals(bytes(0, 1, 2, 3, 4), getBinary(batch, 0));
assertEquals("hi", getText(batch, 0).toString());
// Unpack the nested column vectors once; the same references are reused
// for the second batch below.
StructColumnVector middle = (StructColumnVector) batch.cols[9];
ListColumnVector midList = (ListColumnVector) middle.fields[0];
StructColumnVector midListStruct = (StructColumnVector) midList.child;
LongColumnVector midListInt = (LongColumnVector) midListStruct.fields[0];
BytesColumnVector midListStr = (BytesColumnVector) midListStruct.fields[1];
ListColumnVector list = (ListColumnVector) batch.cols[10];
StructColumnVector listStruct = (StructColumnVector) list.child;
LongColumnVector listInts = (LongColumnVector) listStruct.fields[0];
BytesColumnVector listStrs = (BytesColumnVector) listStruct.fields[1];
MapColumnVector map = (MapColumnVector) batch.cols[11];
BytesColumnVector mapKey = (BytesColumnVector) map.keys;
StructColumnVector mapValue = (StructColumnVector) map.values;
LongColumnVector mapValueInts = (LongColumnVector) mapValue.fields[0];
BytesColumnVector mapValueStrs = (BytesColumnVector) mapValue.fields[1];
TimestampColumnVector timestamp = (TimestampColumnVector) batch.cols[12];
DecimalColumnVector decs = (DecimalColumnVector) batch.cols[13];
assertEquals(false, middle.isNull[0]);
assertEquals(2, midList.lengths[0]);
int start = (int) midList.offsets[0];
assertEquals(1, midListInt.vector[start]);
assertEquals("bye", midListStr.toString(start));
assertEquals(2, midListInt.vector[start + 1]);
assertEquals("sigh", midListStr.toString(start + 1));
assertEquals(2, list.lengths[0]);
start = (int) list.offsets[0];
assertEquals(3, listInts.vector[start]);
assertEquals("good", listStrs.toString(start));
assertEquals(4, listInts.vector[start + 1]);
assertEquals("bad", listStrs.toString(start + 1));
assertEquals(0, map.lengths[0]);
assertEquals(Timestamp.valueOf("2000-03-12 15:00:00"),
assertEquals(new HiveDecimalWritable(HiveDecimal.create("12345678.6547456")),
// check the contents of row 7499
// NOTE(review): no seek is visible before this nextBatch call in this copy,
// yet the assertions below expect the last row's values -- confirm upstream.
Assert.assertEquals(true, rows.nextBatch(batch));
assertEquals(true, getBoolean(batch, 0));
assertEquals(100, getByte(batch, 0));
assertEquals(2048, getShort(batch, 0));
assertEquals(65536, getInt(batch, 0));
assertEquals(Long.MAX_VALUE, getLong(batch, 0));
assertEquals(2.0, getFloat(batch, 0), 0.00001);
assertEquals(-5.0, getDouble(batch, 0), 0.00001);
assertEquals(bytes(), getBinary(batch, 0));
assertEquals("bye", getText(batch, 0).toString());
assertEquals(false, middle.isNull[0]);
assertEquals(2, midList.lengths[0]);
start = (int) midList.offsets[0];
assertEquals(1, midListInt.vector[start]);
assertEquals("bye", midListStr.toString(start));
assertEquals(2, midListInt.vector[start + 1]);
assertEquals("sigh", midListStr.toString(start + 1));
assertEquals(3, list.lengths[0]);
start = (int) list.offsets[0];
assertEquals(100000000, listInts.vector[start]);
assertEquals("cat", listStrs.toString(start));
assertEquals(-100000, listInts.vector[start + 1]);
assertEquals("in", listStrs.toString(start + 1));
assertEquals(1234, listInts.vector[start + 2]);
assertEquals("hat", listStrs.toString(start + 2));
assertEquals(2, map.lengths[0]);
start = (int) map.offsets[0];
assertEquals("chani", mapKey.toString(start));
assertEquals(5, mapValueInts.vector[start]);
assertEquals("chani", mapValueStrs.toString(start));
assertEquals("mauddib", mapKey.toString(start + 1));
assertEquals(1, mapValueInts.vector[start + 1]);
assertEquals("mauddib", mapValueStrs.toString(start + 1));
assertEquals(Timestamp.valueOf("2000-03-12 15:00:01"),
assertEquals(new HiveDecimalWritable(HiveDecimal.create("12345678.6547457")),
// handle the close up
Assert.assertEquals(false, rows.nextBatch(batch));
/**
 * Fills a single {@code TimestampColumnVector} with twelve timestamps of
 * varying sub-second precision, then reopens {@code testFilePath} and
 * checks the reader's final row number and the column-inclusion mask for
 * an empty column list.
 *
 * NOTE(review): in this copy the writer/reader creation calls end in a
 * dangling comma, and neither the statements that write the batch nor the
 * body of the read loop are visible -- confirm against the upstream file.
 * NOTE(review): {@code Lists} is not among the imports visible in this
 * copy, and no test annotation is visible on this method.
 */
public void testTimestamp() throws Exception {
TypeDescription schema = TypeDescription.createTimestamp();
Writer writer = OrcFile.createWriter(testFilePath,
List<Timestamp> tslist = Lists.newArrayList();
tslist.add(Timestamp.valueOf("2037-01-01 00:00:00.000999"));
tslist.add(Timestamp.valueOf("2003-01-01 00:00:00.000000222"));
tslist.add(Timestamp.valueOf("1999-01-01 00:00:00.999999999"));
tslist.add(Timestamp.valueOf("1995-01-01 00:00:00.688888888"));
tslist.add(Timestamp.valueOf("2002-01-01 00:00:00.1"));
tslist.add(Timestamp.valueOf("2010-03-02 00:00:00.000009001"));
tslist.add(Timestamp.valueOf("2005-01-01 00:00:00.000002229"));
tslist.add(Timestamp.valueOf("2006-01-01 00:00:00.900203003"));
tslist.add(Timestamp.valueOf("2003-01-01 00:00:00.800000007"));
tslist.add(Timestamp.valueOf("1996-08-02 00:00:00.723100809"));
tslist.add(Timestamp.valueOf("1998-11-02 00:00:00.857340643"));
tslist.add(Timestamp.valueOf("2008-10-02 00:00:00"));
// The schema is a bare timestamp (no enclosing struct), so the batch is
// assembled by hand with one column of capacity 1024.
VectorizedRowBatch batch = new VectorizedRowBatch(1, 1024);
TimestampColumnVector vec = new TimestampColumnVector(1024);
batch.cols[0] = vec;
batch.size = tslist.size();
for (int i=0; i < tslist.size(); ++i) {
Timestamp ts = tslist.get(i);
vec.set(i, ts);
Reader reader = OrcFile.createReader(testFilePath,
RecordReader rows = reader.rows();
batch = reader.getSchema().createRowBatch();
TimestampColumnVector timestamps = (TimestampColumnVector) batch.cols[0];
int idx = 0;
while (rows.nextBatch(batch)) {
for(int r=0; r < batch.size; ++r) {
// After the read loop the reader's row number must equal the list size.
Assert.assertEquals(tslist.size(), rows.getRowNumber());
// A lone timestamp column has a maximum column id of 0.
assertEquals(0, writer.getSchema().getMaximumId());
// An empty column list is expected to select nothing.
boolean[] expected = new boolean[] {false};
boolean[] included = OrcUtils.includeColumns("", writer.getSchema());
assertEquals(true, Arrays.equals(expected, included));
/**
 * Builds a four-row batch over a {@code struct<bytes1:binary,string1:string>}
 * schema with one null in each column, then checks column-inclusion masks,
 * file-level binary/string statistics and the values read back.
 *
 * NOTE(review): in this copy the writer/reader creation calls and one
 * statistics assertion end in a dangling comma, and the statements that
 * write the batch and close the writer are not visible -- confirm upstream.
 * NOTE(review): no test annotation is visible on this method in this copy.
 */
public void testStringAndBinaryStatistics() throws Exception {
TypeDescription schema = TypeDescription.createStruct()
.addField("bytes1", TypeDescription.createBinary())
.addField("string1", TypeDescription.createString());
Writer writer = OrcFile.createWriter(testFilePath,
VectorizedRowBatch batch = schema.createRowBatch();
batch.size = 4;
BytesColumnVector field1 = (BytesColumnVector) batch.cols[0];
BytesColumnVector field2 = (BytesColumnVector) batch.cols[1];
// Binary column: rows 0-2 hold 5, 4 and 6 bytes; row 3 is null.
field1.setVal(0, bytesArray(0, 1, 2, 3, 4));
field1.setVal(1, bytesArray(0, 1, 2, 3));
field1.setVal(2, bytesArray(0, 1, 2, 3, 4, 5));
field1.noNulls = false;
field1.isNull[3] = true;
// String column: "foo", "bar", null, "hi".
field2.setVal(0, "foo".getBytes(StandardCharsets.UTF_8));
field2.setVal(1, "bar".getBytes(StandardCharsets.UTF_8));
field2.noNulls = false;
field2.isNull[2] = true;
field2.setVal(3, "hi".getBytes(StandardCharsets.UTF_8));
schema = writer.getSchema();
// Column ids: 0 = root struct, 1 = bytes1, 2 = string1.
assertEquals(2, schema.getMaximumId());
Reader reader = OrcFile.createReader(testFilePath,
// Selecting "string1" is expected to mark only column id 2.
boolean[] expected = new boolean[] {false, false, true};
boolean[] included = OrcUtils.includeColumns("string1", schema);
assertEquals(true, Arrays.equals(expected, included));
// An empty or null column list is expected to select nothing.
expected = new boolean[] {false, false, false};
included = OrcUtils.includeColumns("", schema);
assertEquals(true, Arrays.equals(expected, included));
expected = new boolean[] {false, false, false};
included = OrcUtils.includeColumns(null, schema);
assertEquals(true, Arrays.equals(expected, included));
// check the stats
ColumnStatistics[] stats = reader.getStatistics();
// Statistics read from the file must match those reported by the writer.
assertArrayEquals(stats, writer.getStatistics());
assertEquals(4, stats[0].getNumberOfValues());
assertEquals("count: 4 hasNull: false", stats[0].toString());
assertEquals(3, stats[1].getNumberOfValues());
// 5 + 4 + 6 bytes across the three non-null binary values.
assertEquals(15, ((BinaryColumnStatistics) stats[1]).getSum());
assertEquals("count: 3 hasNull: true bytesOnDisk: 28 sum: 15", stats[1].toString());
assertEquals(3, stats[2].getNumberOfValues());
assertEquals("bar", ((StringColumnStatistics) stats[2]).getMinimum());
assertEquals("hi", ((StringColumnStatistics) stats[2]).getMaximum());
// 3 + 3 + 2 characters across "foo", "bar" and "hi".
assertEquals(8, ((StringColumnStatistics) stats[2]).getSum());
// The expected on-disk size of the string column differs for V_0_11.
assertEquals("count: 3 hasNull: true bytesOnDisk: " +
(fileFormat == OrcFile.Version.V_0_11 ? "30" : "22") +
" min: bar max: hi sum: 8",
// check the inspectors
batch = reader.getSchema().createRowBatch();
BytesColumnVector bytes = (BytesColumnVector) batch.cols[0];
BytesColumnVector strs = (BytesColumnVector) batch.cols[1];
RecordReader rows = reader.rows();
Assert.assertEquals(true, rows.nextBatch(batch));
assertEquals(4, batch.size);
// check the contents of the first row
assertEquals(bytes(0,1,2,3,4), getBinary(bytes, 0));
assertEquals("foo", strs.toString(0));
// check the contents of second row
assertEquals(bytes(0,1,2,3), getBinary(bytes, 1));
assertEquals("bar", strs.toString(1));
// check the contents of third row
assertEquals(bytes(0,1,2,3,4,5), getBinary(bytes, 2));
// check the contents of fourth row
assertNull(getBinary(bytes, 3));
assertEquals("hi", strs.toString(3));
// handle the close up
Assert.assertEquals(false, rows.nextBatch(batch));
/**
 * Builds a four-row batch whose single decimal column is null in every row
 * and checks the resulting file-level statistics: four values at the root,
 * zero values and {@code hasNull} on the decimal column, and a zero sum.
 *
 * NOTE(review): in this copy the writer/reader creation calls end in a
 * dangling comma and the statements that write the batch are not visible
 * -- confirm against the upstream file.
 * NOTE(review): no test annotation is visible on this method in this copy.
 */
public void testHiveDecimalStatsAllNulls() throws Exception {
TypeDescription schema = TypeDescription.createStruct()
.addField("dec1", TypeDescription.createDecimal());
Writer writer = OrcFile.createWriter(testFilePath,
VectorizedRowBatch batch = schema.createRowBatch();
batch.size = 4;
DecimalColumnVector field1 = (DecimalColumnVector) batch.cols[0];
// Mark all four rows of the decimal column as null.
field1.noNulls = false;
field1.isNull[0] = true;
field1.isNull[1] = true;
field1.isNull[2] = true;
field1.isNull[3] = true;
Reader reader = OrcFile.createReader(testFilePath,
// check the stats
ColumnStatistics[] stats = reader.getStatistics();
Assert.assertEquals(4, stats[0].getNumberOfValues());
Assert.assertEquals(0, stats[1].getNumberOfValues());
Assert.assertEquals(true, stats[1].hasNull());
// With no non-null values the decimal sum is expected to be zero.
Assert.assertEquals(new HiveDecimalWritable(0).getHiveDecimal(), ((DecimalColumnStatistics)stats[1]).getSum());
/**
 * Prepares eleven repeating 1000-row batches (five with 1/"one", five with
 * 2/"two", one with 3/"three") and checks that the file ends up with three
 * stripes whose per-stripe statistics match those groups, then inspects
 * the row index of the first two stripes.
 *
 * NOTE(review): in this copy the schema initializer is missing, the
 * writer/reader creation calls end in a dangling comma, and the statements
 * that write each batch are not visible -- confirm against upstream.
 * NOTE(review): no test annotation is visible on this method in this copy.
 */
public void testStripeLevelStats() throws Exception {
TypeDescription schema =
Writer writer = OrcFile.createWriter(testFilePath,
VectorizedRowBatch batch = schema.createRowBatch();
batch.size = 1000;
LongColumnVector field1 = (LongColumnVector) batch.cols[0];
BytesColumnVector field2 = (BytesColumnVector) batch.cols[1];
// Repeating vectors: only element 0 is set and it stands for every row.
field1.isRepeating = true;
field2.isRepeating = true;
// Iterations 0-4 use 1/"one", 5-9 use 2/"two", and 10 uses 3/"three".
for (int b = 0; b < 11; b++) {
if (b >= 5) {
if (b >= 10) {
field1.vector[0] = 3;
field2.setVal(0, "three".getBytes(StandardCharsets.UTF_8));
} else {
field1.vector[0] = 2;
field2.setVal(0, "two".getBytes(StandardCharsets.UTF_8));
} else {
field1.vector[0] = 1;
field2.setVal(0, "one".getBytes(StandardCharsets.UTF_8));
Reader reader = OrcFile.createReader(testFilePath,
schema = writer.getSchema();
assertEquals(2, schema.getMaximumId());
// Selecting "int1" is expected to mark only column id 1.
boolean[] expected = new boolean[] {false, true, false};
boolean[] included = OrcUtils.includeColumns("int1", schema);
assertEquals(true, Arrays.equals(expected, included));
List<StripeStatistics> stats = reader.getStripeStatistics();
int numStripes = stats.size();
assertEquals(3, numStripes);
StripeStatistics ss1 = stats.get(0);
StripeStatistics ss2 = stats.get(1);
StripeStatistics ss3 = stats.get(2);
// Expected row counts per stripe: 5000, 5000 and 1000.
assertEquals(5000, ss1.getColumnStatistics()[0].getNumberOfValues());
assertEquals(5000, ss2.getColumnStatistics()[0].getNumberOfValues());
assertEquals(1000, ss3.getColumnStatistics()[0].getNumberOfValues());
assertEquals(5000, (ss1.getColumnStatistics()[1]).getNumberOfValues());
assertEquals(5000, (ss2.getColumnStatistics()[1]).getNumberOfValues());
assertEquals(1000, (ss3.getColumnStatistics()[1]).getNumberOfValues());
// Each stripe holds one distinct int, so min == max and sum = count * value.
assertEquals(1, ((IntegerColumnStatistics)ss1.getColumnStatistics()[1]).getMinimum());
assertEquals(2, ((IntegerColumnStatistics)ss2.getColumnStatistics()[1]).getMinimum());
assertEquals(3, ((IntegerColumnStatistics)ss3.getColumnStatistics()[1]).getMinimum());
assertEquals(1, ((IntegerColumnStatistics)ss1.getColumnStatistics()[1]).getMaximum());
assertEquals(2, ((IntegerColumnStatistics)ss2.getColumnStatistics()[1]).getMaximum());
assertEquals(3, ((IntegerColumnStatistics)ss3.getColumnStatistics()[1]).getMaximum());
assertEquals(5000, ((IntegerColumnStatistics)ss1.getColumnStatistics()[1]).getSum());
assertEquals(10000, ((IntegerColumnStatistics)ss2.getColumnStatistics()[1]).getSum());
assertEquals(3000, ((IntegerColumnStatistics)ss3.getColumnStatistics()[1]).getSum());
assertEquals(5000, (ss1.getColumnStatistics()[2]).getNumberOfValues());
assertEquals(5000, (ss2.getColumnStatistics()[2]).getNumberOfValues());
assertEquals(1000, (ss3.getColumnStatistics()[2]).getNumberOfValues());
assertEquals("one", ((StringColumnStatistics)ss1.getColumnStatistics()[2]).getMinimum());
assertEquals("two", ((StringColumnStatistics)ss2.getColumnStatistics()[2]).getMinimum());
assertEquals("three", ((StringColumnStatistics)ss3.getColumnStatistics()[2]).getMinimum());
assertEquals("one", ((StringColumnStatistics)ss1.getColumnStatistics()[2]).getMaximum());
assertEquals("two", ((StringColumnStatistics) ss2.getColumnStatistics()[2]).getMaximum());
assertEquals("three", ((StringColumnStatistics)ss3.getColumnStatistics()[2]).getMaximum());
// String sums are total lengths: 5000 * 3, 5000 * 3 and 1000 * 5.
assertEquals(15000, ((StringColumnStatistics)ss1.getColumnStatistics()[2]).getSum());
assertEquals(15000, ((StringColumnStatistics)ss2.getColumnStatistics()[2]).getSum());
assertEquals(5000, ((StringColumnStatistics)ss3.getColumnStatistics()[2]).getSum());
RecordReaderImpl recordReader = (RecordReaderImpl) reader.rows();
// Row index of stripe 0: one entry array per column id (3 in total).
OrcProto.RowIndex[] index = recordReader.readRowIndex(0, null, null).getRowGroupIndex();
assertEquals(3, index.length);
List<OrcProto.RowIndexEntry> items = index[1].getEntryList();
assertEquals(1, items.size());
assertEquals(3, items.get(0).getPositionsCount());
assertEquals(0, items.get(0).getPositions(0));
assertEquals(0, items.get(0).getPositions(1));
assertEquals(0, items.get(0).getPositions(2));
// Row index of stripe 1.
index = recordReader.readRowIndex(1, null, null).getRowGroupIndex();
assertEquals(3, index.length);
items = index[1].getEntryList();
private static void setInner(StructColumnVector inner, int rowId,
int i, String value) {
((LongColumnVector) inner.fields[0]).vector[rowId] = i;
if (value != null) {
((BytesColumnVector) inner.fields[1]).setVal(rowId, value.getBytes(StandardCharsets.UTF_8));
} else {
inner.fields[1].isNull[rowId] = true;
inner.fields[1].noNulls = false;
private static void checkInner(StructColumnVector inner, int rowId,
int rowInBatch, int i, String value) {
assertEquals("row " + rowId, i,
((LongColumnVector) inner.fields[0]).vector[rowInBatch]);
if (value != null) {
assertEquals("row " + rowId, value,
((BytesColumnVector) inner.fields[1]).toString(rowInBatch));
} else {
assertEquals("row " + rowId, true, inner.fields[1].isNull[rowInBatch]);
assertEquals("row " + rowId, false, inner.fields[1].noNulls);
private static void setInnerList(ListColumnVector list, int rowId,
List<InnerStruct> value) {
if (value != null) {
if (list.childCount + value.size() > list.child.isNull.length) {
list.child.ensureSize(list.childCount * 2, true);
list.lengths[rowId] = value.size();
list.offsets[rowId] = list.childCount;
for (int i = 0; i < list.lengths[rowId]; ++i) {
InnerStruct inner = value.get(i);
setInner((StructColumnVector) list.child, i + list.childCount,
inner.int1, inner.string1.toString());
list.childCount += value.size();
} else {
list.isNull[rowId] = true;
list.noNulls = false;
private static void checkInnerList(ListColumnVector list, int rowId,
int rowInBatch, List<InnerStruct> value) {
if (value != null) {
assertEquals("row " + rowId, value.size(), list.lengths[rowInBatch]);
int start = (int) list.offsets[rowInBatch];
for (int i = 0; i < list.lengths[rowInBatch]; ++i) {
InnerStruct inner = value.get(i);
checkInner((StructColumnVector) list.child, rowId, i + start,
inner.int1, inner.string1.toString());
list.childCount += value.size();
} else {
assertEquals("row " + rowId, true, list.isNull[rowInBatch]);
assertEquals("row " + rowId, false, list.noNulls);
private static void setInnerMap(MapColumnVector map, int rowId,
Map<String, InnerStruct> value) {
if (value != null) {
if (map.childCount >= map.keys.isNull.length) {
map.keys.ensureSize(map.childCount * 2, true);
map.values.ensureSize(map.childCount * 2, true);
map.lengths[rowId] = value.size();
int offset = map.childCount;
map.offsets[rowId] = offset;
for (Map.Entry<String, InnerStruct> entry : value.entrySet()) {
((BytesColumnVector) map.keys).setVal(offset, entry.getKey().getBytes(StandardCharsets.UTF_8));
InnerStruct inner = entry.getValue();
setInner((StructColumnVector) map.values, offset, inner.int1,
offset += 1;
map.childCount = offset;
} else {
map.isNull[rowId] = true;
map.noNulls = false;
private static void checkInnerMap(MapColumnVector map, int rowId,
int rowInBatch,
Map<String, InnerStruct> value) {
if (value != null) {
assertEquals("row " + rowId, value.size(), map.lengths[rowInBatch]);
int offset = (int) map.offsets[rowInBatch];
for(int i=0; i < value.size(); ++i) {
String key = ((BytesColumnVector) map.keys).toString(offset + i);
InnerStruct expected = value.get(key);
checkInner((StructColumnVector) map.values, rowId, offset + i,
expected.int1, expected.string1.toString());
} else {
assertEquals("row " + rowId, true, map.isNull[rowId]);
assertEquals("row " + rowId, false, map.noNulls);
private static void setMiddleStruct(StructColumnVector middle, int rowId,
MiddleStruct value) {
if (value != null) {
setInnerList((ListColumnVector) middle.fields[0], rowId, value.list);
} else {
middle.isNull[rowId] = true;
middle.noNulls = false;
private static void checkMiddleStruct(StructColumnVector middle, int rowId,
int rowInBatch, MiddleStruct value) {
if (value != null) {
checkInnerList((ListColumnVector) middle.fields[0], rowId, rowInBatch,
} else {
assertEquals("row " + rowId, true, middle.isNull[rowInBatch]);
assertEquals("row " + rowId, false, middle.noNulls);
private static void setBigRow(VectorizedRowBatch batch, int rowId,
Boolean b1, Byte b2, Short s1,
Integer i1, Long l1, Float f1,
Double d1, BytesWritable b3, String s2,
MiddleStruct m1, List<InnerStruct> l2,
Map<String, InnerStruct> m2) {
((LongColumnVector) batch.cols[0]).vector[rowId] = b1 ? 1 : 0;
((LongColumnVector) batch.cols[1]).vector[rowId] = b2;
((LongColumnVector) batch.cols[2]).vector[rowId] = s1;
((LongColumnVector) batch.cols[3]).vector[rowId] = i1;
((LongColumnVector) batch.cols[4]).vector[rowId] = l1;
((DoubleColumnVector) batch.cols[5]).vector[rowId] = f1;
((DoubleColumnVector) batch.cols[6]).vector[rowId] = d1;
if (b3 != null) {
((BytesColumnVector) batch.cols[7]).setVal(rowId, b3.getBytes(), 0,
} else {
batch.cols[7].isNull[rowId] = true;
batch.cols[7].noNulls = false;
if (s2 != null) {
((BytesColumnVector) batch.cols[8]).setVal(rowId, s2.getBytes(StandardCharsets.UTF_8));
} else {
batch.cols[8].isNull[rowId] = true;
batch.cols[8].noNulls = false;
setMiddleStruct((StructColumnVector) batch.cols[9], rowId, m1);
setInnerList((ListColumnVector) batch.cols[10], rowId, l2);
setInnerMap((MapColumnVector) batch.cols[11], rowId, m2);
private static void checkBigRow(VectorizedRowBatch batch,
int rowInBatch,
int rowId,
boolean b1, byte b2, short s1,
int i1, long l1, float f1,
double d1, BytesWritable b3, String s2,
MiddleStruct m1, List<InnerStruct> l2,
Map<String, InnerStruct> m2) {
assertEquals("row " + rowId, b1, getBoolean(batch, rowInBatch));
assertEquals("row " + rowId, b2, getByte(batch, rowInBatch));
assertEquals("row " + rowId, s1, getShort(batch, rowInBatch));
assertEquals("row " + rowId, i1, getInt(batch, rowInBatch));
assertEquals("row " + rowId, l1, getLong(batch, rowInBatch));
assertEquals("row " + rowId, f1, getFloat(batch, rowInBatch), 0.0001);
assertEquals("row " + rowId, d1, getDouble(batch, rowInBatch), 0.0001);
if (b3 != null) {
BytesColumnVector bytes = (BytesColumnVector) batch.cols[7];
assertEquals("row " + rowId, b3.getLength(), bytes.length[rowInBatch]);
for(int i=0; i < b3.getLength(); ++i) {
assertEquals("row " + rowId + " byte " + i, b3.getBytes()[i],
bytes.vector[rowInBatch][bytes.start[rowInBatch] + i]);
} else {
assertEquals("row " + rowId, true, batch.cols[7].isNull[rowInBatch]);
assertEquals("row " + rowId, false, batch.cols[7].noNulls);
if (s2 != null) {
assertEquals("row " + rowId, s2, getText(batch, rowInBatch).toString());
} else {
assertEquals("row " + rowId, true, batch.cols[8].isNull[rowInBatch]);
assertEquals("row " + rowId, false, batch.cols[8].noNulls);
checkMiddleStruct((StructColumnVector) batch.cols[9], rowId, rowInBatch,
checkInnerList((ListColumnVector) batch.cols[10], rowId, rowInBatch, l2);
checkInnerMap((MapColumnVector) batch.cols[11], rowId, rowInBatch, m2);
private static boolean getBoolean(VectorizedRowBatch batch, int rowId) {
return ((LongColumnVector) batch.cols[0]).vector[rowId] != 0;
private static byte getByte(VectorizedRowBatch batch, int rowId) {
return (byte) ((LongColumnVector) batch.cols[1]).vector[rowId];
private static short getShort(VectorizedRowBatch batch, int rowId) {
return (short) ((LongColumnVector) batch.cols[2]).vector[rowId];
private static int getInt(VectorizedRowBatch batch, int rowId) {
return (int) ((LongColumnVector) batch.cols[3]).vector[rowId];
private static long getLong(VectorizedRowBatch batch, int rowId) {
return ((LongColumnVector) batch.cols[4]).vector[rowId];
private static float getFloat(VectorizedRowBatch batch, int rowId) {
return (float) ((DoubleColumnVector) batch.cols[5]).vector[rowId];
private static double getDouble(VectorizedRowBatch batch, int rowId) {
return ((DoubleColumnVector) batch.cols[6]).vector[rowId];
private static BytesWritable getBinary(BytesColumnVector column, int rowId) {
if (column.isRepeating) {
rowId = 0;
if (column.noNulls || !column.isNull[rowId]) {
return new BytesWritable(Arrays.copyOfRange(column.vector[rowId],
column.start[rowId], column.start[rowId] + column.length[rowId]));
} else {
return null;
private static BytesWritable getBinary(VectorizedRowBatch batch, int rowId) {
return getBinary((BytesColumnVector) batch.cols[7], rowId);
private static Text getText(BytesColumnVector vector, int rowId) {
if (vector.isRepeating) {
rowId = 0;
if (vector.noNulls || !vector.isNull[rowId]) {
return new Text(Arrays.copyOfRange(vector.vector[rowId],
vector.start[rowId], vector.start[rowId] + vector.length[rowId]));
} else {
return null;
private static Text getText(VectorizedRowBatch batch, int rowId) {
return getText((BytesColumnVector) batch.cols[8], rowId);
private static InnerStruct getInner(StructColumnVector vector,
int rowId) {
return new InnerStruct(
(int) ((LongColumnVector) vector.fields[0]).vector[rowId],
getText((BytesColumnVector) vector.fields[1], rowId));
private static List<InnerStruct> getList(ListColumnVector cv,
int rowId) {
if (cv.isRepeating) {
rowId = 0;
if (cv.noNulls || !cv.isNull[rowId]) {
List<InnerStruct> result =
new ArrayList<InnerStruct>((int) cv.lengths[rowId]);
for(long i=cv.offsets[rowId];
i < cv.offsets[rowId] + cv.lengths[rowId]; ++i) {
result.add(getInner((StructColumnVector) cv.child, (int) i));
return result;
} else {
return null;
private static List<InnerStruct> getMidList(VectorizedRowBatch batch,
int rowId) {
return getList((ListColumnVector) ((StructColumnVector) batch.cols[9])
.fields[0], rowId);
private static List<InnerStruct> getList(VectorizedRowBatch batch,
int rowId) {
return getList((ListColumnVector) batch.cols[10], rowId);
private static Map<Text, InnerStruct> getMap(VectorizedRowBatch batch,
int rowId) {
MapColumnVector cv = (MapColumnVector) batch.cols[11];
if (cv.isRepeating) {
rowId = 0;
if (cv.noNulls || !cv.isNull[rowId]) {
Map<Text, InnerStruct> result =
new HashMap<Text, InnerStruct>((int) cv.lengths[rowId]);
for(long i=cv.offsets[rowId];
i < cv.offsets[rowId] + cv.lengths[rowId]; ++i) {
result.put(getText((BytesColumnVector) cv.keys, (int) i),
getInner((StructColumnVector) cv.values, (int) i));
return result;
} else {
return null;
private static TypeDescription createInnerSchema() {
return TypeDescription.fromString("struct<int1:int,string1:string>");
private static TypeDescription createComplexInnerSchema()
return TypeDescription.fromString("struct<int1:int,"
+ "complex:struct<int2:int,String1:string>>");
// Builds the wide struct schema used by the "big row" tests. Field order fixes the batch
// column indexes the getters rely on: boolean1=0, byte1=1, short1=2, int1=3, long1=4,
// float1=5, double1=6, bytes1=7, string1=8, middle=9, list=10, map=11.
// NOTE(review): the arguments of the final createMap( call are not present in this copy;
// confirm the key/value types against the full source.
private static TypeDescription createBigRowSchema() {
return TypeDescription.createStruct()
.addField("boolean1", TypeDescription.createBoolean())
.addField("byte1", TypeDescription.createByte())
.addField("short1", TypeDescription.createShort())
.addField("int1", TypeDescription.createInt())
.addField("long1", TypeDescription.createLong())
.addField("float1", TypeDescription.createFloat())
.addField("double1", TypeDescription.createDouble())
.addField("bytes1", TypeDescription.createBinary())
.addField("string1", TypeDescription.createString())
.addField("middle", TypeDescription.createStruct()
.addField("list", TypeDescription.createList(createInnerSchema())))
.addField("list", TypeDescription.createList(createInnerSchema()))
.addField("map", TypeDescription.createMap(
static void assertArrayBooleanEquals(boolean[] expected, boolean[] actual) {
assertEquals(expected.length, actual.length);
boolean diff = false;
for(int i=0; i < expected.length; ++i) {
if (expected[i] != actual[i]) {
System.out.println("Difference at " + i + " expected: " + expected[i] +
" actual: " + actual[i]);
diff = true;
assertEquals(false, diff);
// Populates two "big row" records, then checks column include masks, file-level and
// stripe-level statistics, the reader schema, and every value read back from the file.
// NOTE(review): several statements are incomplete in this copy (for example the
// createWriter/createReader argument lists and the calls that add the batch and close the
// writer); confirm against the full source before relying on these comments.
public void test1() throws Exception {
TypeDescription schema = createBigRowSchema();
Writer writer = OrcFile.createWriter(testFilePath,
VectorizedRowBatch batch = schema.createRowBatch();
batch.size = 2;
setBigRow(batch, 0, false, (byte) 1, (short) 1024, 65536,
Long.MAX_VALUE, (float) 1.0, -15.0, bytes(0, 1, 2, 3, 4), "hi",
new MiddleStruct(inner(1, "bye"), inner(2, "sigh")),
list(inner(3, "good"), inner(4, "bad")),
setBigRow(batch, 1, true, (byte) 100, (short) 2048, 65536,
Long.MAX_VALUE, (float) 2.0, -5.0, bytes(), "bye",
new MiddleStruct(inner(1, "bye"), inner(2, "sigh")),
list(inner(100000000, "cat"), inner(-100000, "in"), inner(1234, "hat")),
map(inner(5, "chani"), inner(1, "mauddib")));
// Statistics as reported by the writer; compared below with what the reader reports.
ColumnStatistics[] closeStatistics = writer.getStatistics();
Reader reader = OrcFile.createReader(testFilePath,
schema = writer.getSchema();
// The largest column id is 23, so each include mask below has 24 entries (ids 0..23).
assertEquals(23, schema.getMaximumId());
// An empty column list is expected to select nothing.
boolean[] expected = new boolean[] {false, false, false, false, false,
false, false, false, false, false,
false, false, false, false, false,
false, false, false, false, false,
false, false, false, false};
boolean[] included = OrcUtils.includeColumns("", schema);
assertEquals(true, Arrays.equals(expected, included));
// Expected mask for boolean1, string1, middle and map: ids 1, 9-14 and 19-23.
expected = new boolean[] {false, true, false, false, false,
false, false, false, false, true,
true, true, true, true, true,
false, false, false, false, true,
true, true, true, true};
included = OrcUtils.includeColumns("boolean1,string1,middle,map", schema);
assertArrayBooleanEquals(expected, included);
// NOTE(review): this repeats the previous selection and expectation verbatim.
expected = new boolean[] {false, true, false, false, false,
false, false, false, false, true,
true, true, true, true, true,
false, false, false, false, true,
true, true, true, true};
included = OrcUtils.includeColumns("boolean1,string1,middle,map", schema);
assertArrayBooleanEquals(expected, included);
// Expected mask with every id except 0 selected.
expected = new boolean[] {false, true, true, true, true,
true, true, true, true, true,
true, true, true, true, true,
true, true, true, true, true,
true, true, true, true};
included = OrcUtils.includeColumns(
assertEquals(true, Arrays.equals(expected, included));
// check the stats
ColumnStatistics[] stats = reader.getStatistics();
assertArrayEquals(stats, closeStatistics);
assertEquals(2, stats[1].getNumberOfValues());
assertEquals(1, ((BooleanColumnStatistics) stats[1]).getFalseCount());
assertEquals(1, ((BooleanColumnStatistics) stats[1]).getTrueCount());
assertEquals("count: 2 hasNull: false bytesOnDisk: 5 true: 1", stats[1].toString());
assertEquals(2048, ((IntegerColumnStatistics) stats[3]).getMaximum());
assertEquals(1024, ((IntegerColumnStatistics) stats[3]).getMinimum());
assertEquals(true, ((IntegerColumnStatistics) stats[3]).isSumDefined());
assertEquals(3072, ((IntegerColumnStatistics) stats[3]).getSum());
assertEquals("count: 2 hasNull: false bytesOnDisk: " +
(fileFormat == OrcFile.Version.V_0_11 ? "8" : "9") +
" min: 1024 max: 2048 sum: 3072", stats[3].toString());
// The first stripe's statistics are checked against the same values as the file-level ones.
StripeStatistics ss = reader.getStripeStatistics().get(0);
assertEquals(2, ss.getColumnStatistics()[0].getNumberOfValues());
assertEquals(1, ((BooleanColumnStatistics) ss.getColumnStatistics()[1]).getTrueCount());
assertEquals(1024, ((IntegerColumnStatistics) ss.getColumnStatistics()[3]).getMinimum());
assertEquals(2048, ((IntegerColumnStatistics) ss.getColumnStatistics()[3]).getMaximum());
assertEquals(3072, ((IntegerColumnStatistics) ss.getColumnStatistics()[3]).getSum());
assertEquals(-15.0, ((DoubleColumnStatistics) stats[7]).getMinimum(), 0.0001);
assertEquals(-5.0, ((DoubleColumnStatistics) stats[7]).getMaximum(), 0.0001);
assertEquals(-20.0, ((DoubleColumnStatistics) stats[7]).getSum(), 0.00001);
assertEquals("count: 2 hasNull: false bytesOnDisk: 15 min: -15.0 max: -5.0 sum: -20.0",
assertEquals("count: 2 hasNull: false bytesOnDisk: " +
(fileFormat == OrcFile.Version.V_0_11 ? "20" : "14") +
" min: bye max: hi sum: 5", stats[9].toString());
// check the schema
TypeDescription readerSchema = reader.getSchema();
assertEquals(TypeDescription.Category.STRUCT, readerSchema.getCategory());
+ "int1:int,long1:bigint,float1:float,double1:double,bytes1:"
+ "binary,string1:string,middle:struct<list:array<struct<int1:int,"
+ "string1:string>>>,list:array<struct<int1:int,string1:string>>,"
+ "map:map<string,struct<int1:int,string1:string>>>",
List<String> fieldNames = readerSchema.getFieldNames();
List<TypeDescription> fieldTypes = readerSchema.getChildren();
assertEquals("boolean1", fieldNames.get(0));
assertEquals(TypeDescription.Category.BOOLEAN, fieldTypes.get(0).getCategory());
assertEquals("byte1", fieldNames.get(1));
assertEquals(TypeDescription.Category.BYTE, fieldTypes.get(1).getCategory());
assertEquals("short1", fieldNames.get(2));
assertEquals(TypeDescription.Category.SHORT, fieldTypes.get(2).getCategory());
assertEquals("int1", fieldNames.get(3));
assertEquals(TypeDescription.Category.INT, fieldTypes.get(3).getCategory());
assertEquals("long1", fieldNames.get(4));
assertEquals(TypeDescription.Category.LONG, fieldTypes.get(4).getCategory());
assertEquals("float1", fieldNames.get(5));
assertEquals(TypeDescription.Category.FLOAT, fieldTypes.get(5).getCategory());
assertEquals("double1", fieldNames.get(6));
assertEquals(TypeDescription.Category.DOUBLE, fieldTypes.get(6).getCategory());
assertEquals("bytes1", fieldNames.get(7));
assertEquals(TypeDescription.Category.BINARY, fieldTypes.get(7).getCategory());
assertEquals("string1", fieldNames.get(8));
assertEquals(TypeDescription.Category.STRING, fieldTypes.get(8).getCategory());
assertEquals("middle", fieldNames.get(9));
// Walk down middle -> list -> element struct and check each level's category.
TypeDescription middle = fieldTypes.get(9);
assertEquals(TypeDescription.Category.STRUCT, middle.getCategory());
TypeDescription midList = middle.getChildren().get(0);
assertEquals(TypeDescription.Category.LIST, midList.getCategory());
TypeDescription inner = midList.getChildren().get(0);
assertEquals(TypeDescription.Category.STRUCT, inner.getCategory());
assertEquals("int1", inner.getFieldNames().get(0));
assertEquals("string1", inner.getFieldNames().get(1));
RecordReader rows = reader.rows();
// create a new batch
batch = readerSchema.createRowBatch();
// Both rows are expected in the first batch, with nothing left afterwards.
Assert.assertEquals(true, rows.nextBatch(batch));
assertEquals(2, batch.size);
Assert.assertEquals(false, rows.nextBatch(batch));
// check the contents of the first row
assertEquals(false, getBoolean(batch, 0));
assertEquals(1, getByte(batch, 0));
assertEquals(1024, getShort(batch, 0));
assertEquals(65536, getInt(batch, 0));
assertEquals(Long.MAX_VALUE, getLong(batch, 0));
assertEquals(1.0, getFloat(batch, 0), 0.00001);
assertEquals(-15.0, getDouble(batch, 0), 0.00001);
assertEquals(bytes(0,1,2,3,4), getBinary(batch, 0));
assertEquals("hi", getText(batch, 0).toString());
List<InnerStruct> midRow = getMidList(batch, 0);
assertEquals(2, midRow.size());
assertEquals(1, midRow.get(0).int1);
assertEquals("bye", midRow.get(0).string1.toString());
assertEquals(2, midRow.get(1).int1);
assertEquals("sigh", midRow.get(1).string1.toString());
List<InnerStruct> list = getList(batch, 0);
assertEquals(2, list.size());
assertEquals(3, list.get(0).int1);
assertEquals("good", list.get(0).string1.toString());
assertEquals(4, list.get(1).int1);
assertEquals("bad", list.get(1).string1.toString());
// Row 0 is expected to carry an empty (non-null) map.
Map<Text, InnerStruct> map = getMap(batch, 0);
assertEquals(0, map.size());
// check the contents of second row
assertEquals(true, getBoolean(batch, 1));
assertEquals(100, getByte(batch, 1));
assertEquals(2048, getShort(batch, 1));
assertEquals(65536, getInt(batch, 1));
assertEquals(Long.MAX_VALUE, getLong(batch, 1));
assertEquals(2.0, getFloat(batch, 1), 0.00001);
assertEquals(-5.0, getDouble(batch, 1), 0.00001);
assertEquals(bytes(), getBinary(batch, 1));
assertEquals("bye", getText(batch, 1).toString());
midRow = getMidList(batch, 1);
assertEquals(2, midRow.size());
assertEquals(1, midRow.get(0).int1);
assertEquals("bye", midRow.get(0).string1.toString());
assertEquals(2, midRow.get(1).int1);
assertEquals("sigh", midRow.get(1).string1.toString());
list = getList(batch, 1);
assertEquals(3, list.size());
assertEquals(100000000, list.get(0).int1);
assertEquals("cat", list.get(0).string1.toString());
assertEquals(-100000, list.get(1).int1);
assertEquals("in", list.get(1).string1.toString());
assertEquals(1234, list.get(2).int1);
assertEquals("hat", list.get(2).string1.toString());
// Map entries are looked up by the string1 value of each inner struct.
map = getMap(batch, 1);
assertEquals(2, map.size());
InnerStruct value = map.get(new Text("chani"));
assertEquals(5, value.int1);
assertEquals("chani", value.string1.toString());
value = map.get(new Text("mauddib"));
assertEquals(1, value.int1);
assertEquals("mauddib", value.string1.toString());
// handle the close up
Assert.assertEquals(false, rows.nextBatch(batch));
static void assertEmptyStats(ColumnStatistics[] writerStatistics) {
for (ColumnStatistics columnStatistics : writerStatistics){
assertEquals(0, columnStatistics.getNumberOfValues());
// Fills 21 batches of 1000 rows with seeded random ints and hex strings, then opens two
// record readers with different include masks and walks the file in 1000-row batches.
// NOTE(review): the calls that add each batch, close the writer, and the first halves of the
// per-row comparisons are not present in this copy; confirm against the full source.
public void testColumnProjection() throws Exception {
TypeDescription schema = createInnerSchema();
Writer writer = OrcFile.createWriter(testFilePath,
VectorizedRowBatch batch = schema.createRowBatch();
// Fixed seeds so the same sequences can be regenerated when reading back.
Random r1 = new Random(1);
Random r2 = new Random(2);
int x;
int minInt=0, maxInt=0;
String y;
String minStr = null, maxStr = null;
batch.size = 1000;
// "first" makes the very first value seed all four running min/max trackers.
boolean first = true;
for(int b=0; b < 21; ++b) {
for(int r=0; r < 1000; ++r) {
x = r1.nextInt();
y = Long.toHexString(r2.nextLong());
if (first || x < minInt) {
minInt = x;
if (first || x > maxInt) {
maxInt = x;
if (first || y.compareTo(minStr) < 0) {
minStr = y;
if (first || y.compareTo(maxStr) > 0) {
maxStr = y;
first = false;
((LongColumnVector) batch.cols[0]).vector[r] = x;
((BytesColumnVector) batch.cols[1]).setVal(r, y.getBytes(StandardCharsets.UTF_8));
Reader reader = OrcFile.createReader(testFilePath,
// check out the statistics
ColumnStatistics[] stats = reader.getStatistics();
// Three entries are expected: presumably the root struct plus its two fields.
assertEquals(3, stats.length);
for(ColumnStatistics s: stats) {
assertEquals(21000, s.getNumberOfValues());
if (s instanceof IntegerColumnStatistics) {
assertEquals(minInt, ((IntegerColumnStatistics) s).getMinimum());
assertEquals(maxInt, ((IntegerColumnStatistics) s).getMaximum());
} else if (s instanceof StringColumnStatistics) {
assertEquals(maxStr, ((StringColumnStatistics) s).getMaximum());
assertEquals(minStr, ((StringColumnStatistics) s).getMinimum());
// check out the types
TypeDescription type = reader.getSchema();
assertEquals(TypeDescription.Category.STRUCT, type.getCategory());
assertEquals(2, type.getChildren().size());
TypeDescription type1 = type.getChildren().get(0);
TypeDescription type2 = type.getChildren().get(1);
assertEquals(TypeDescription.Category.INT, type1.getCategory());
assertEquals(TypeDescription.Category.STRING, type2.getCategory());
assertEquals("struct<int1:int,string1:string>", type.toString());
// read the contents and make sure they match
// rows1 includes ids 0 and 1; rows2 includes ids 0 and 2.
RecordReader rows1 = reader.rows(
reader.options().include(new boolean[]{true, true, false}));
RecordReader rows2 = reader.rows(
reader.options().include(new boolean[]{true, false, true}));
// Re-seed with the same values used when generating the rows above.
r1 = new Random(1);
r2 = new Random(2);
VectorizedRowBatch batch1 = reader.getSchema().createRowBatch(1000);
VectorizedRowBatch batch2 = reader.getSchema().createRowBatch(1000);
for(int i = 0; i < 21000; i += 1000) {
Assert.assertEquals(true, rows1.nextBatch(batch1));
Assert.assertEquals(true, rows2.nextBatch(batch2));
assertEquals(1000, batch1.size);
assertEquals(1000, batch2.size);
for(int j=0; j < 1000; ++j) {
((LongColumnVector) batch1.cols[0]).vector[j]);
((BytesColumnVector) batch2.cols[1]).toString(j));
// Both readers are expected to be exhausted after 21 batches.
Assert.assertEquals(false, rows1.nextBatch(batch1));
Assert.assertEquals(false, rows2.nextBatch(batch2));
// Checks what a reader reports for a file whose visible setup adds no rows: no batches,
// no compression, zero rows, no user metadata, no stripes, and a content length of 3.
// NOTE(review): the createWriter/createReader argument lists and the writer close call are
// not present in this copy; confirm against the full source.
public void testEmptyFile() throws Exception {
TypeDescription schema = createBigRowSchema();
Writer writer = OrcFile.createWriter(testFilePath,
Reader reader = OrcFile.createReader(testFilePath,
VectorizedRowBatch batch = reader.getSchema().createRowBatch();
Assert.assertEquals(false, reader.rows().nextBatch(batch));
Assert.assertEquals(CompressionKind.NONE, reader.getCompressionKind());
Assert.assertEquals(0, reader.getNumberOfRows());
Assert.assertEquals(0, reader.getCompressionSize());
Assert.assertEquals(false, reader.getMetadataKeys().iterator().hasNext());
Assert.assertEquals(3, reader.getContentLength());
Assert.assertEquals(false, reader.getStripes().iterator().hasNext());
// Attaches user metadata to a file (including a key that is overwritten several times and a
// 40000-byte value) and checks that the reader returns the last value written for each key.
// NOTE(review): several statements are incomplete in this copy (the end of the "my.meta"
// value, how bigBuf is filled, the body of the try block, the batch write/close calls);
// confirm against the full source.
public void metaData() throws Exception {
TypeDescription schema = createBigRowSchema();
Writer writer = OrcFile.createWriter(testFilePath,
writer.addUserMetadata("my.meta", byteBuf(1, 2, 3, 4, 5, 6, 7, -1, -2, 127,
// "clobber" is written twice here and once more below; the final value is the one asserted.
writer.addUserMetadata("clobber", byteBuf(1, 2, 3));
writer.addUserMetadata("clobber", byteBuf(4, 3, 2, 1));
ByteBuffer bigBuf = ByteBuffer.allocate(40000);
Random random = new Random(0);
writer.addUserMetadata("big", bigBuf);
VectorizedRowBatch batch = schema.createRowBatch();
batch.size = 1;
// A single row whose last five arguments are all null.
setBigRow(batch, 0, true, (byte) 127, (short) 1024, 42,
42L * 1024 * 1024 * 1024, (float) 3.1415, -2.713, null,
null, null, null, null);
writer.addUserMetadata("clobber", byteBuf(5,7,11,13,17,19));
Reader reader = OrcFile.createReader(testFilePath,
Assert.assertEquals(byteBuf(5, 7, 11, 13, 17, 19), reader.getMetadataValue("clobber"));
Assert.assertEquals(byteBuf(1, 2, 3, 4, 5, 6, 7, -1, -2, 127, -128),
Assert.assertEquals(bigBuf, reader.getMetadataValue("big"));
try {
} catch (IllegalArgumentException iae) {
// Exactly the three keys written above are expected; any other key is an error.
int i = 0;
for(String key: reader.getMetadataKeys()) {
if ("my.meta".equals(key) ||
"clobber".equals(key) ||
"big".equals(key)) {
i += 1;
} else {
throw new IllegalArgumentException("unknown key " + key);
assertEquals(3, i);
int numStripes = reader.getStripeStatistics().size();
assertEquals(1, numStripes);
* Generate an ORC file with a range of dates and times.
// Writes a (timestamp, date) file with 1000 rows for each year in [minYear, maxYear) plus one
// extra row, checks the writer's timestamp statistics, then reads the rows back and compares
// them with the regenerated expected values.
// NOTE(review): the batch write/close calls, the nextBatch calls and the second halves of
// several assertions are not present in this copy; confirm against the full source.
public void createOrcDateFile(Path file, int minYear, int maxYear
) throws IOException {
TypeDescription schema = TypeDescription.createStruct()
.addField("time", TypeDescription.createTimestamp())
.addField("date", TypeDescription.createDate());
Writer writer = OrcFile.createWriter(file,
VectorizedRowBatch batch = schema.createRowBatch();
batch.size = 1000;
TimestampColumnVector timestampColVector = (TimestampColumnVector) batch.cols[0];
for (int year = minYear; year < maxYear; ++year) {
for (int row = 0; row < 1000; ++row) {
// The fractional-second field is 2*row zero-padded to four digits.
String timeStr = String.format("%04d-05-05 12:34:56.%04d", year, 2*row);
timestampColVector.set(row, Timestamp.valueOf(timeStr));
// The date column holds one repeated value per batch: December 25 of the current year
// (java.util.Date takes year-1900 and a zero-based month).
((LongColumnVector) batch.cols[1]).vector[0] =
new DateWritable(new Date(year - 1900, 11, 25)).getDays();
batch.cols[1].isRepeating = true;
// add one more row to check the statistics for the jvm bug case
batch.size = 1;
String timeStr = String.format("%04d-12-12 12:34:56.0001", maxYear-1);
timestampColVector.set(0, Timestamp.valueOf(timeStr));
// check the stats to make sure they match up to the millisecond
ColumnStatistics[] stats = writer.getStatistics();
TimestampColumnStatistics tsStat = (TimestampColumnStatistics) stats[1];
assertEquals(String.format("%04d-12-12 12:34:56.0", maxYear - 1),
assertEquals(String.format("%04d-05-05 12:34:56.0", minYear),
// read back the rows
Reader reader = OrcFile.createReader(file,
RecordReader rows = reader.rows();
batch = reader.getSchema().createRowBatch(1000);
TimestampColumnVector times = (TimestampColumnVector) batch.cols[0];
LongColumnVector dates = (LongColumnVector) batch.cols[1];
for (int year = minYear; year < maxYear; ++year) {
assertEquals(1000, batch.size);
for(int row = 0; row < 1000; ++row) {
// Rebuild the timestamp that was written for this year and row.
Timestamp expected = Timestamp.valueOf(
String.format("%04d-05-05 12:34:56.%04d", year, 2*row));
assertEquals("ms row " + row + " " + expected, expected.getTime(),
assertEquals("nanos row " + row + " " + expected, expected.getNanos(),
assertEquals("year " + year + " row " + row,
Integer.toString(year) + "-12-25",
new DateWritable((int) dates.vector[row]).toString());
// The extra single-row batch written after the yearly batches.
assertEquals(1, batch.size);
public void testDate1900() throws Exception {
createOrcDateFile(testFilePath, 1900, 1970);
public void testDate2038() throws Exception {
createOrcDateFile(testFilePath, 2038, 2250);
private static void setUnion(VectorizedRowBatch batch, int rowId,
Timestamp ts, Integer tag, Integer i, String s,
HiveDecimalWritable dec, Timestamp instant) {
UnionColumnVector union = (UnionColumnVector) batch.cols[1];
if (ts != null) {
TimestampColumnVector timestampColVector = (TimestampColumnVector) batch.cols[0];
timestampColVector.set(rowId, ts);
} else {
batch.cols[0].isNull[rowId] = true;
batch.cols[0].noNulls = false;
if (tag != null) {
union.tags[rowId] = tag;
if (tag == 0) {
if (i != null) {
((LongColumnVector) union.fields[tag]).vector[rowId] = i;
} else {
union.fields[tag].isNull[rowId] = true;
union.fields[tag].noNulls = false;
} else if (tag == 1) {
if (s != null) {
((BytesColumnVector) union.fields[tag]).setVal(rowId, s.getBytes(StandardCharsets.UTF_8));
} else {
union.fields[tag].isNull[rowId] = true;
union.fields[tag].noNulls = false;
} else {
throw new IllegalArgumentException("Bad tag " + tag);
} else {
batch.cols[1].isNull[rowId] = true;
batch.cols[1].noNulls = false;
if (dec != null) {
((DecimalColumnVector) batch.cols[2]).vector[rowId] = dec;
} else {
batch.cols[2].isNull[rowId] = true;
batch.cols[2].noNulls = false;
if (instant == null) {
batch.cols[3].isNull[rowId] = true;
batch.cols[3].noNulls = false;
} else {
((TimestampColumnVector) batch.cols[3]).set(rowId, instant);
* Test writing with the new decimal and reading with the new and old.
// Fills a Decimal64ColumnVector for a decimal(18,3) column with 19 values and checks them
// after reading into both a V2 (Decimal64) batch and an old-style (DecimalColumnVector) batch.
// NOTE(review): the batch write/close calls, the nextBatch calls and the tail of one
// statistics assertion are not present in this copy; confirm against the full source.
public void testDecimal64Writing() throws Exception {
TypeDescription schema = TypeDescription.fromString("struct<d:decimal(18,3)>");
VectorizedRowBatch batch = schema.createRowBatchV2();
Writer writer = OrcFile.createWriter(testFilePath,
Decimal64ColumnVector cv = (Decimal64ColumnVector) batch.cols[0];
cv.precision = 18;
cv.scale = 3;
// Rows 0..17 hold 1, 10, 100, ... (each ten times the previous); row 18 holds -2000.
cv.vector[0] = 1;
for(int r=1; r < 18; r++) {
cv.vector[r] = cv.vector[r-1] * 10;
cv.vector[18] = -2000;
batch.size = 19;
Reader reader = OrcFile.createReader(testFilePath,
assertEquals("count: 19 hasNull: false", reader.getStatistics()[0].toString());
// the size of the column in the different formats
int size = (fileFormat == OrcFile.Version.V_0_11 ? 89 :
fileFormat == OrcFile.Version.V_0_12 ? 90 : 154);
assertEquals("count: 19 hasNull: false bytesOnDisk: " + size +
" min: -2 max: 100000000000000 sum: 111111111111109.111",
RecordReader rows = reader.rows();
batch = schema.createRowBatchV2();
cv = (Decimal64ColumnVector) batch.cols[0];
assertEquals(19, batch.size);
assertEquals(18, cv.precision);
assertEquals(3, cv.scale);
assertEquals("row 0", 1, cv.vector[0]);
for(int r=1; r < 18; ++r) {
assertEquals("row " + r, 10 * cv.vector[r-1], cv.vector[r]);
assertEquals(-2000, cv.vector[18]);
// test with old batch
rows = reader.rows();
batch = schema.createRowBatch();
DecimalColumnVector oldCv = (DecimalColumnVector) batch.cols[0];
assertEquals(19, batch.size);
assertEquals(18, oldCv.precision);
assertEquals(3, oldCv.scale);
// With scale 3, the raw long 1 is expected to read as 0.001, 10 as 0.01, and so on.
assertEquals("0.001", oldCv.vector[0].toString());
assertEquals("0.01", oldCv.vector[1].toString());
assertEquals("0.1", oldCv.vector[2].toString());
assertEquals("1", oldCv.vector[3].toString());
assertEquals("10", oldCv.vector[4].toString());
assertEquals("100", oldCv.vector[5].toString());
assertEquals("1000", oldCv.vector[6].toString());
assertEquals("10000", oldCv.vector[7].toString());
assertEquals("100000", oldCv.vector[8].toString());
assertEquals("1000000", oldCv.vector[9].toString());
assertEquals("10000000", oldCv.vector[10].toString());
assertEquals("100000000", oldCv.vector[11].toString());
assertEquals("1000000000", oldCv.vector[12].toString());
assertEquals("10000000000", oldCv.vector[13].toString());
assertEquals("100000000000", oldCv.vector[14].toString());
assertEquals("1000000000000", oldCv.vector[15].toString());
assertEquals("10000000000000", oldCv.vector[16].toString());
assertEquals("100000000000000", oldCv.vector[17].toString());
assertEquals("-2", oldCv.vector[18].toString());
* Test writing with the old decimal and reading with the new and old.
// Fills an old-style DecimalColumnVector for a decimal(18,4) column and checks the values
// after reading into both a V2 (Decimal64) batch and an old-style batch.
// NOTE(review): the assignment of row 18, the batch write/close calls, the nextBatch calls
// and the tail of one statistics assertion are not present in this copy; confirm against
// the full source.
public void testDecimal64Reading() throws Exception {
TypeDescription schema = TypeDescription.fromString("struct<d:decimal(18,4)>");
VectorizedRowBatch batch = schema.createRowBatch();
Writer writer = OrcFile.createWriter(testFilePath,
DecimalColumnVector cv = (DecimalColumnVector) batch.cols[0];
cv.precision = 18;
// NOTE(review): scale is set to 3 here although the schema and the values below use
// scale 4, and scale 4 is what is asserted after reading — confirm this is intentional.
cv.scale = 3;
// Rows 0..17 hold 1, 10, 100, ... as unscaled longs with scale 4.
long base = 1;
for(int r=0; r < 18; r++) {
cv.vector[r].setFromLongAndScale(base, 4);
base *= 10;
batch.size = 19;
// test with new batch
Reader reader = OrcFile.createReader(testFilePath,
assertEquals("count: 19 hasNull: false", reader.getStatistics()[0].toString());
// the size of the column in the different formats
int size = (fileFormat == OrcFile.Version.V_0_11 ? 63 :
fileFormat == OrcFile.Version.V_0_12 ? 65 : 154);
assertEquals("count: 19 hasNull: false bytesOnDisk: " + size +
" min: -2 max: 10000000000000 sum: 11111111111109.1111",
RecordReader rows = reader.rows();
batch = schema.createRowBatchV2();
Decimal64ColumnVector newCv = (Decimal64ColumnVector) batch.cols[0];
assertEquals(19, batch.size);
assertEquals(18, newCv.precision);
assertEquals(4, newCv.scale);
assertEquals("row 0", 1, newCv.vector[0]);
for(int r=1; r < 18; ++r) {
assertEquals("row " + r, 10 * newCv.vector[r-1], newCv.vector[r]);
// -2 at scale 4 is expected as the raw long -20000.
assertEquals(-20000, newCv.vector[18]);
// test with old batch
rows = reader.rows();
batch = schema.createRowBatch();
cv = (DecimalColumnVector) batch.cols[0];
assertEquals(19, batch.size);
assertEquals(18, cv.precision);
assertEquals(4, cv.scale);
assertEquals("0.0001", cv.vector[0].toString());
assertEquals("0.001", cv.vector[1].toString());
assertEquals("0.01", cv.vector[2].toString());
assertEquals("0.1", cv.vector[3].toString());
assertEquals("1", cv.vector[4].toString());
assertEquals("10", cv.vector[5].toString());
assertEquals("100", cv.vector[6].toString());
assertEquals("1000", cv.vector[7].toString());
assertEquals("10000", cv.vector[8].toString());
assertEquals("100000", cv.vector[9].toString());
assertEquals("1000000", cv.vector[10].toString());
assertEquals("10000000", cv.vector[11].toString());
assertEquals("100000000", cv.vector[12].toString());
assertEquals("1000000000", cv.vector[13].toString());
assertEquals("10000000000", cv.vector[14].toString());
assertEquals("100000000000", cv.vector[15].toString());
assertEquals("1000000000000", cv.vector[16].toString());
assertEquals("10000000000000", cv.vector[17].toString());
assertEquals("-2", cv.vector[18].toString());
* We test union, timestamp, and decimal separately since we need to make the
* object inspector manually. (The Hive reflection-based object inspector
* doesn't handle them properly.)
// Exercises a schema of (timestamp, union<int,string>, decimal(38,18), timestamp with local
// time zone): fills rows via setUnion, then checks writer and reader statistics, column
// include masks, stripe layout, and the values read back.
// NOTE(review): many statements are not present in this copy (the default time zone
// change and its restore, the createWriter/createReader arguments, addRowBatch/close,
// batch.reset, nextBatch and seek calls); confirm against the full source before relying
// on these comments.
public void testUnionAndTimestamp() throws Exception {
// Captured so the default time zone can be restored; the restore is not visible here.
final TimeZone original = TimeZone.getDefault();
TypeDescription schema = TypeDescription.fromString(
"struct<time:timestamp," +
"union:uniontype<int,string>," +
"decimal:decimal(38,18)," +
"instant:timestamp with local time zone>"
// Running maximum of the decimals written; starts at the largest hand-written value.
HiveDecimal maxValue = HiveDecimal.create("10000000000000000000");
Writer writer = OrcFile.createWriter(testFilePath,
VectorizedRowBatch batch = schema.createRowBatch();
batch.size = 6;
// Six hand-written rows: two fully populated, one all-null, two with a tag but a null
// union member, and one more fully populated row.
setUnion(batch, 0, Timestamp.valueOf("2000-03-12 15:00:00"), 0, 42, null,
new HiveDecimalWritable("12345678.6547456"),
Timestamp.valueOf("2014-12-12 6:00:00"));
setUnion(batch, 1, Timestamp.valueOf("2000-03-20 12:00:00.123456789"),
1, null, "hello", new HiveDecimalWritable("-5643.234"),
Timestamp.valueOf("1996-12-11 11:00:00"));
setUnion(batch, 2, null, null, null, null, null, null);
setUnion(batch, 3, null, 0, null, null, null, null);
setUnion(batch, 4, null, 1, null, null, null, null);
setUnion(batch, 5, Timestamp.valueOf("1970-01-01 00:00:00"), 0, 200000,
null, new HiveDecimalWritable("10000000000000000000"),
Timestamp.valueOf("2011-07-01 09:00:00"));
// One appended row per year 1970..2037: even years use the int member, odd years the
// string member; decimals come from a seeded generator so they can be regenerated later.
Random rand = new Random(42);
for(int i=1970; i < 2038; ++i) {
Timestamp ts = Timestamp.valueOf(i + "-05-05 12:34:56." + i);
HiveDecimal dec =
HiveDecimal.create(new BigInteger(64, rand), rand.nextInt(18));
if ((i & 1) == 0) {
setUnion(batch, batch.size++, ts, 0, i*i, null,
new HiveDecimalWritable(dec), null);
} else {
setUnion(batch, batch.size++, ts, 1, null, Integer.toString(i*i),
new HiveDecimalWritable(dec), null);
if (maxValue.compareTo(dec) < 0) {
maxValue = dec;
// let's add a lot of constant rows to test the rle
batch.size = 1000;
for(int c=0; c < batch.cols.length; ++c) {
((UnionColumnVector) batch.cols[1]).fields[0].isRepeating = true;
setUnion(batch, 0, null, 0, 1732050807, null, null, null);
for(int i=0; i < 5; ++i) {
// A final three-row batch with small int values.
batch.size = 3;
setUnion(batch, 0, null, 0, 0, null, null, null);
setUnion(batch, 1, null, 0, 10, null, null, null);
setUnion(batch, 2, null, 0, 138, null, null, null);
// check the stats on the writer side
ColumnStatistics[] stats = writer.getStatistics();
assertEquals("1996-12-11 11:00:00.0",
((TimestampColumnStatistics) stats[6]).getMinimum().toString());
assertEquals("1996-12-11 11:00:00.0",
((TimestampColumnStatistics) stats[6]).getMinimumUTC().toString());
assertEquals("2014-12-12 06:00:00.0",
((TimestampColumnStatistics) stats[6]).getMaximum().toString());
assertEquals("2014-12-12 06:00:00.0",
((TimestampColumnStatistics) stats[6]).getMaximumUTC().toString());
Reader reader = OrcFile.createReader(testFilePath,
stats = reader.getStatistics();
// check the timestamp statistics
assertEquals("1970-01-01 00:00:00.0",
((TimestampColumnStatistics) stats[1]).getMinimum().toString());
assertEquals("1969-12-31 19:00:00.0",
((TimestampColumnStatistics) stats[1]).getMinimumUTC().toString());
assertEquals("2037-05-05 12:34:56.203",
((TimestampColumnStatistics) stats[1]).getMaximum().toString());
assertEquals("2037-05-05 08:34:56.203",
((TimestampColumnStatistics) stats[1]).getMaximumUTC().toString());
// check the instant statistics
// NOTE(review): these expectations are three hours later than the writer-side ones above,
// presumably because the default time zone differs between writing and reading — confirm.
assertEquals("1996-12-11 14:00:00.0",
((TimestampColumnStatistics) stats[6]).getMinimum().toString());
assertEquals("1996-12-11 14:00:00.0",
((TimestampColumnStatistics) stats[6]).getMinimumUTC().toString());
assertEquals("2014-12-12 09:00:00.0",
((TimestampColumnStatistics) stats[6]).getMaximum().toString());
assertEquals("2014-12-12 09:00:00.0",
((TimestampColumnStatistics) stats[6]).getMaximumUTC().toString());
schema = writer.getSchema();
// Column ids run 0..6, so each include mask below has seven entries.
assertEquals(6, schema.getMaximumId());
boolean[] expected = new boolean[] {false, false, false, false, false, false, false};
boolean[] included = OrcUtils.includeColumns("", schema);
assertEquals(true, Arrays.equals(expected, included));
expected = new boolean[] {false, true, false, false, false, true, false};
included = OrcUtils.includeColumns("time,decimal", schema);
assertEquals(true, Arrays.equals(expected, included));
// Selecting the union is expected to select its two members as well (ids 2, 3, 4).
expected = new boolean[] {false, false, true, true, true, false, false};
included = OrcUtils.includeColumns("union", schema);
assertEquals(true, Arrays.equals(expected, included));
Assert.assertEquals(false, reader.getMetadataKeys().iterator().hasNext());
Assert.assertEquals(5077, reader.getNumberOfRows());
DecimalColumnStatistics decStats =
(DecimalColumnStatistics) reader.getStatistics()[5];
assertEquals(71, decStats.getNumberOfValues());
assertEquals(HiveDecimal.create("-5643.234"), decStats.getMinimum());
assertEquals(maxValue, decStats.getMaximum());
// TODO: fix this
// assertEquals(null,stats.getSum());
// Stripes are expected to be laid out back to back, ending at the content length.
int stripeCount = 0;
int rowCount = 0;
long currentOffset = -1;
for(StripeInformation stripe: reader.getStripes()) {
stripeCount += 1;
rowCount += stripe.getNumberOfRows();
if (currentOffset < 0) {
currentOffset = stripe.getOffset() + stripe.getLength();
} else {
assertEquals(currentOffset, stripe.getOffset());
currentOffset += stripe.getLength();
Assert.assertEquals(reader.getNumberOfRows(), rowCount);
assertEquals(2, stripeCount);
Assert.assertEquals(reader.getContentLength(), currentOffset);
RecordReader rows = reader.rows();
Assert.assertEquals(0, rows.getRowNumber());
Assert.assertEquals(0.0, rows.getProgress(), 0.000001);
schema = reader.getSchema();
// 74 = the 6 hand-written rows plus the 68 yearly rows (1970..2037).
batch = schema.createRowBatch(74);
Assert.assertEquals(0, rows.getRowNumber());
assertEquals(74, batch.size);
Assert.assertEquals(74, rows.getRowNumber());
TimestampColumnVector ts = (TimestampColumnVector) batch.cols[0];
UnionColumnVector union = (UnionColumnVector) batch.cols[1];
LongColumnVector longs = (LongColumnVector) union.fields[0];
BytesColumnVector strs = (BytesColumnVector) union.fields[1];
DecimalColumnVector decs = (DecimalColumnVector) batch.cols[2];
TimestampColumnVector instant = (TimestampColumnVector) batch.cols[3];
assertEquals("struct<time:timestamp,union:uniontype<int,string>,decimal:decimal(38,18)," +
"instant:timestamp with local time zone>",
// Rows 0 and 1: fully populated.
assertEquals("2000-03-12 15:00:00.0", ts.asScratchTimestamp(0).toString());
assertEquals(0, union.tags[0]);
assertEquals(42, longs.vector[0]);
assertEquals("12345678.6547456", decs.vector[0].toString());
assertEquals("2014-12-12 09:00:00.0", instant.asScratchTimestamp(0).toString());
assertEquals("2000-03-20 12:00:00.123456789", ts.asScratchTimestamp(1).toString());
assertEquals(1, union.tags[1]);
assertEquals("hello", strs.toString(1));
assertEquals("-5643.234", decs.vector[1].toString());
assertEquals("1996-12-11 14:00:00.0", instant.asScratchTimestamp(1).toString());
// Row 2: everything null. Rows 3 and 4: union present but its selected member is null.
assertEquals(false, ts.noNulls);
assertEquals(false, union.noNulls);
assertEquals(false, decs.noNulls);
assertEquals(true, ts.isNull[2]);
assertEquals(true, union.isNull[2]);
assertEquals(true, decs.isNull[2]);
assertEquals(true, ts.isNull[3]);
assertEquals(false, union.isNull[3]);
assertEquals(0, union.tags[3]);
assertEquals(true, longs.isNull[3]);
assertEquals(true, decs.isNull[3]);
assertEquals(true, ts.isNull[4]);
assertEquals(false, union.isNull[4]);
assertEquals(1, union.tags[4]);
assertEquals(true, strs.isNull[4]);
assertEquals(true, decs.isNull[4]);
// Row 5: fully populated again.
assertEquals(false, ts.isNull[5]);
assertEquals("1970-01-01 00:00:00.0", ts.asScratchTimestamp(5).toString());
assertEquals(false, union.isNull[5]);
assertEquals(0, union.tags[5]);
assertEquals(false, longs.isNull[5]);
assertEquals(200000, longs.vector[5]);
assertEquals(false, decs.isNull[5]);
assertEquals("10000000000000000000", decs.vector[5].toString());
assertEquals("2011-07-01 12:00:00.0", instant.asScratchTimestamp(5).toString());
// Re-seed so the decimals regenerate in the same order they were written.
rand = new Random(42);
for(int i=1970; i < 2038; ++i) {
// The yearly rows start right after the six hand-written rows.
int row = 6 + i - 1970;
assertEquals(Timestamp.valueOf(i + "-05-05 12:34:56." + i),
if ((i & 1) == 0) {
assertEquals(0, union.tags[row]);
assertEquals(i*i, longs.vector[row]);
} else {
assertEquals(1, union.tags[row]);
assertEquals(Integer.toString(i * i), strs.toString(row));
assertEquals(new HiveDecimalWritable(HiveDecimal.create(new BigInteger(64, rand),
rand.nextInt(18))), decs.vector[row]);
// rebuild the row batch, so that we can read by 1000 rows
batch = schema.createRowBatch(1000);
ts = (TimestampColumnVector) batch.cols[0];
union = (UnionColumnVector) batch.cols[1];
longs = (LongColumnVector) union.fields[0];
strs = (BytesColumnVector) union.fields[1];
decs = (DecimalColumnVector) batch.cols[2];
// Five batches of 1000 constant rows: every tag is 0 and the long member repeats.
for(int i=0; i < 5; ++i) {
assertEquals("batch " + i, 1000, batch.size);
assertEquals("batch " + i, false, union.isRepeating);
assertEquals("batch " + i, true, union.noNulls);
for(int r=0; r < batch.size; ++r) {
assertEquals("bad tag at " + i + "." +r, 0, union.tags[r]);
assertEquals("batch " + i, true, longs.isRepeating);
assertEquals("batch " + i, 1732050807, longs.vector[0]);
// The final three-row batch.
assertEquals(3, batch.size);
assertEquals(0, union.tags[0]);
assertEquals(0, longs.vector[0]);
assertEquals(0, union.tags[1]);
assertEquals(10, longs.vector[1]);
assertEquals(0, union.tags[2]);
assertEquals(138, longs.vector[2]);
// Nothing is left: an empty batch, full progress, and the row number at the row count.
assertEquals(0, batch.size);
Assert.assertEquals(1.0, rows.getProgress(), 0.00001);
Assert.assertEquals(reader.getNumberOfRows(), rows.getRowNumber());
// NOTE(review): the checks below match the values written for row 1, so they presumably
// follow a seek back to that row; the seek call is not visible here — confirm.
assertEquals(1000, batch.size);
assertEquals(Timestamp.valueOf("2000-03-20 12:00:00.123456789"), ts.asScratchTimestamp(0));
assertEquals(1, union.tags[0]);
assertEquals("hello", strs.toString(0));
assertEquals(new HiveDecimalWritable(HiveDecimal.create("-5643.234")), decs.vector[0]);
* Read and write a randomly generated snappy file.
* @throws Exception
// Writes ten 1000-row batches of random data, reads the file back and checks both
// the reported compression kind (SNAPPY) and every value.
public void testSnappy() throws Exception {
TypeDescription schema = createInnerSchema();
Writer writer = OrcFile.createWriter(testFilePath,
VectorizedRowBatch batch = schema.createRowBatch();
Random rand;
// 10 batches of 1000 rows each
writeRandomIntBytesBatches(writer, batch, 10, 1000);
Reader reader = OrcFile.createReader(testFilePath,
Assert.assertEquals(CompressionKind.SNAPPY, reader.getCompressionKind());
RecordReader rows = reader.rows();
batch = reader.getSchema().createRowBatch(1000);
// Seed 12 is the seed used by writeRandomIntBytesBatches, so replaying the
// nextInt() sequence yields the expected values in write order.
rand = new Random(12);
LongColumnVector longs = (LongColumnVector) batch.cols[0];
BytesColumnVector strs = (BytesColumnVector) batch.cols[1];
for(int b=0; b < 10; ++b) {
assertEquals(1000, batch.size);
for(int r=0; r < batch.size; ++r) {
// column 0 is compared to one random int, column 1 to the hex form of the next one
assertEquals(rand.nextInt(), longs.vector[r]);
assertEquals(Integer.toHexString(rand.nextInt()), strs.toString(r));
// once the ten batches are consumed an empty batch is expected
assertEquals(0, batch.size);
* Read and write a randomly generated lzo file.
* @throws Exception
// Writes ten 1000-row batches into three long columns, reads them back and checks
// that the reader reports LZO compression and returns the same values.
public void testLzo() throws Exception {
TypeDescription schema =
Writer writer = OrcFile.createWriter(testFilePath,
VectorizedRowBatch batch = schema.createRowBatch();
Random rand = new Random(69);
batch.size = 1000;
for(int b=0; b < 10; ++b) {
for (int r=0; r < 1000; ++r) {
// col 0: random int, col 1: running row number, col 2: random long
((LongColumnVector) batch.cols[0]).vector[r] = rand.nextInt();
((LongColumnVector) batch.cols[1]).vector[r] = b * 1000 + r;
((LongColumnVector) batch.cols[2]).vector[r] = rand.nextLong();
Reader reader = OrcFile.createReader(testFilePath,
assertEquals(CompressionKind.LZO, reader.getCompressionKind());
RecordReader rows = reader.rows();
batch = reader.getSchema().createRowBatch(1000);
// restart the generator with the write-side seed to replay the expected values
rand = new Random(69);
for(int b=0; b < 10; ++b) {
assertEquals(1000, batch.size);
for(int r=0; r < batch.size; ++r) {
((LongColumnVector) batch.cols[0]).vector[r]);
// col 1 must contain the global row number
assertEquals(b * 1000 + r,
((LongColumnVector) batch.cols[1]).vector[r]);
((LongColumnVector) batch.cols[2]).vector[r]);
// once the ten batches are consumed an empty batch is expected
assertEquals(0, batch.size);
* Read and write a randomly generated lz4 file.
* @throws Exception
// Writes ten 1000-row batches into three long columns, reads them back and checks
// that the reader reports LZ4 compression and returns the same values.
public void testLz4() throws Exception {
TypeDescription schema =
Writer writer = OrcFile.createWriter(testFilePath,
VectorizedRowBatch batch = schema.createRowBatch();
Random rand = new Random(3);
batch.size = 1000;
for(int b=0; b < 10; ++b) {
for (int r=0; r < 1000; ++r) {
// col 0: random int, col 1: running row number, col 2: random long
((LongColumnVector) batch.cols[0]).vector[r] = rand.nextInt();
((LongColumnVector) batch.cols[1]).vector[r] = b * 1000 + r;
((LongColumnVector) batch.cols[2]).vector[r] = rand.nextLong();
Reader reader = OrcFile.createReader(testFilePath,
assertEquals(CompressionKind.LZ4, reader.getCompressionKind());
RecordReader rows = reader.rows();
batch = reader.getSchema().createRowBatch(1000);
// restart the generator with the write-side seed to replay the expected values
rand = new Random(3);
for(int b=0; b < 10; ++b) {
assertEquals(1000, batch.size);
for(int r=0; r < batch.size; ++r) {
((LongColumnVector) batch.cols[0]).vector[r]);
// col 1 must contain the global row number
assertEquals(b * 1000 + r,
((LongColumnVector) batch.cols[1]).vector[r]);
((LongColumnVector) batch.cols[2]).vector[r]);
// once the ten batches are consumed an empty batch is expected
assertEquals(0, batch.size);
* Read and write a randomly generated zstd file.
// Same round trip as the other codec tests, for ZSTD. Writer, reader and record
// reader are all opened in try-with-resources here.
public void testZstd() throws Exception {
TypeDescription schema =
try (Writer writer = OrcFile.createWriter(testFilePath,
.version(fileFormat))) {
VectorizedRowBatch batch = schema.createRowBatch();
Random rand = new Random(3);
batch.size = 1000;
for (int b = 0; b < 10; ++b) {
for (int r = 0; r < 1000; ++r) {
// col 0: random int, col 1: running row number, col 2: random long
((LongColumnVector) batch.cols[0]).vector[r] = rand.nextInt();
((LongColumnVector) batch.cols[1]).vector[r] = b * 1000 + r;
((LongColumnVector) batch.cols[2]).vector[r] = rand.nextLong();
try (Reader reader = OrcFile.createReader(testFilePath,
RecordReader rows = reader.rows()) {
assertEquals(CompressionKind.ZSTD, reader.getCompressionKind());
VectorizedRowBatch batch = reader.getSchema().createRowBatch(1000);
// fresh generator with the write-side seed to replay the expected values
Random rand = new Random(3);
for (int b = 0; b < 10; ++b) {
assertEquals(1000, batch.size);
for (int r = 0; r < batch.size; ++r) {
((LongColumnVector) batch.cols[0]).vector[r]);
// col 1 must contain the global row number
assertEquals(b * 1000 + r,
((LongColumnVector) batch.cols[1]).vector[r]);
((LongColumnVector) batch.cols[2]).vector[r]);
// once the ten batches are consumed an empty batch is expected
assertEquals(0, batch.size);
* Read and write a file; verify codec usage.
* @throws Exception
// Checks how writers and readers share CompressionCodec instances through OrcCodecPool.
public void testCodecPool() throws Exception {
TypeDescription schema = createInnerSchema();
VectorizedRowBatch batch = schema.createRowBatch();
WriterOptions opts = OrcFile.writerOptions(conf)
CompressionCodec snappyCodec, zlibCodec;
// After a SNAPPY write exactly one SNAPPY codec is expected in the pool.
snappyCodec = writeBatchesAndGetCodec(10, 1000, opts.compress(CompressionKind.SNAPPY), batch);
assertEquals(1, OrcCodecPool.getPoolSize(CompressionKind.SNAPPY));
// Reading the file back must use that very instance and leave the pool size at one.
Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs));
Assert.assertEquals(CompressionKind.SNAPPY, reader.getCompressionKind());
CompressionCodec codec = readBatchesAndGetCodec(reader, 10, 1000);
assertEquals(1, OrcCodecPool.getPoolSize(CompressionKind.SNAPPY));
assertSame(snappyCodec, codec);
// A second read behaves the same way.
reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs));
Assert.assertEquals(CompressionKind.SNAPPY, reader.getCompressionKind());
codec = readBatchesAndGetCodec(reader, 10, 1000);
assertSame(snappyCodec, codec);
assertEquals(1, OrcCodecPool.getPoolSize(CompressionKind.SNAPPY));
// ZLIB gets its own codec instance and its own pool count.
zlibCodec = writeBatchesAndGetCodec(10, 1000, opts.compress(CompressionKind.ZLIB), batch);
assertNotSame(snappyCodec, zlibCodec);
assertEquals(1, OrcCodecPool.getPoolSize(CompressionKind.ZLIB));
// A second ZLIB write is expected to get the same ZLIB instance again.
codec = writeBatchesAndGetCodec(10, 1000, opts.compress(CompressionKind.ZLIB), batch);
assertEquals(1, OrcCodecPool.getPoolSize(CompressionKind.ZLIB));
assertSame(zlibCodec, codec);
// NOTE(review): getCodec presumably checks the SNAPPY codec out of the pool, which would
// explain why the next writer is expected to end up with a different instance - confirm.
assertSame(snappyCodec, OrcCodecPool.getCodec(CompressionKind.SNAPPY));
CompressionCodec snappyCodec2 = writeBatchesAndGetCodec(
10, 1000, opts.compress(CompressionKind.SNAPPY), batch);
assertNotSame(snappyCodec, snappyCodec2);
// Hand the first codec back; after the next read the pool is expected to hold two
// SNAPPY codecs, and the reader may have used either of them.
OrcCodecPool.returnCodec(CompressionKind.SNAPPY, snappyCodec);
reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs));
Assert.assertEquals(CompressionKind.SNAPPY, reader.getCompressionKind());
codec = readBatchesAndGetCodec(reader, 10, 1000);
assertEquals(2, OrcCodecPool.getPoolSize(CompressionKind.SNAPPY));
assertTrue(snappyCodec == codec || snappyCodec2 == codec);
private CompressionCodec writeBatchesAndGetCodec(int count,
int size,
WriterOptions opts,
VectorizedRowBatch batch
) throws IOException {
fs.delete(testFilePath, false);
Writer writer = OrcFile.createWriter(testFilePath, opts);
CompressionCodec codec = ((WriterImpl) writer).getCompressionCodec();
writeRandomIntBytesBatches(writer, batch, count, size);
return codec;
// Opens a record reader on the given reader, iterates `count` times with a batch of
// `size` rows, and returns the codec exposed by the underlying RecordReaderImpl.
// NOTE(review): the per-iteration statement of the loop is presumably
// rows.nextBatch(batch) - confirm.
private CompressionCodec readBatchesAndGetCodec(
Reader reader, int count, int size) throws IOException {
RecordReader rows = reader.rows();
VectorizedRowBatch batch = reader.getSchema().createRowBatch(size);
for (int b = 0; b < count; ++b) {
// the cast relies on reader.rows() returning a RecordReaderImpl
CompressionCodec codec = ((RecordReaderImpl)rows).getCompressionCodec();
return codec;
// NOTE(review): helper taking a Reader, its RecordReader, a batch count and a batch
// size; presumably the read-side counterpart of writeRandomIntBytesBatches - confirm.
private void readRandomBatches(
Reader reader, RecordReader rows, int count, int size) throws IOException {
// Fills `batch` with `size` rows of random data, `count` times over.
// The generator is always seeded with 12, so readers can replay the same sequence.
private void writeRandomIntBytesBatches(
Writer writer, VectorizedRowBatch batch, int count, int size) throws IOException {
Random rand = new Random(12);
batch.size = size;
for(int b=0; b < count; ++b) {
for (int r=0; r < size; ++r) {
// column 0 receives a random int, column 1 a byte value set via setVal
((LongColumnVector) batch.cols[0]).vector[r] = rand.nextInt();
((BytesColumnVector) batch.cols[1]).setVal(r,
* Read and write a randomly generated snappy file that has no row index.
* @throws Exception
// Writes 10000 five-row batches, then checks that the file reports a row index
// stride of 0, that its first stripe has data but no index, and that all 50000
// rows read back correctly.
public void testWithoutIndex() throws Exception {
TypeDescription schema = createInnerSchema();
Writer writer = OrcFile.createWriter(testFilePath,
VectorizedRowBatch batch = schema.createRowBatch();
Random rand = new Random(24);
batch.size = 5;
// NOTE(review): this loop presumably marks every column as repeating, since only
// slot 0 is assigned below while the read side expects 5 identical rows - confirm.
for(int c=0; c < batch.cols.length; ++c) {
for(int i=0; i < 10000; ++i) {
((LongColumnVector) batch.cols[0]).vector[0] = rand.nextInt();
((BytesColumnVector) batch.cols[1])
.setVal(0, Integer.toBinaryString(rand.nextInt()).getBytes(StandardCharsets.UTF_8));
Reader reader = OrcFile.createReader(testFilePath,
// 10000 batches x 5 rows
Assert.assertEquals(50000, reader.getNumberOfRows());
Assert.assertEquals(0, reader.getRowIndexStride());
StripeInformation stripe = reader.getStripes().iterator().next();
assertEquals(true, stripe.getDataLength() != 0);
assertEquals(0, stripe.getIndexLength());
RecordReader rows = reader.rows();
// replay the write-side sequence
rand = new Random(24);
batch = reader.getSchema().createRowBatch(1000);
LongColumnVector longs = (LongColumnVector) batch.cols[0];
BytesColumnVector strs = (BytesColumnVector) batch.cols[1];
for(int i=0; i < 50; ++i) {
assertEquals("batch " + i, 1000, batch.size);
// each 1000-row batch holds 200 runs of 5 identical rows
for(int j=0; j < 200; ++j) {
int intVal = rand.nextInt();
String strVal = Integer.toBinaryString(rand.nextInt());
for (int k = 0; k < 5; ++k) {
assertEquals(intVal, longs.vector[j * 5 + k]);
assertEquals(strVal, strs.toString(j * 5 + k));
// once the 50 batches are consumed an empty batch is expected
assertEquals(0, batch.size);
// Writes 32768 "big row" records, then checks the row index, reads the file
// backwards in chunks, and finally reads a byte range spanning stripes 2-3
// with only two columns selected.
public void testSeek() throws Exception {
TypeDescription schema = createBigRowSchema();
Writer writer = OrcFile.createWriter(testFilePath,
VectorizedRowBatch batch = schema.createRowBatch();
Random rand = new Random(42);
final int COUNT=32768;
long[] intValues= new long[COUNT];
double[] doubleValues = new double[COUNT];
String[] stringValues = new String[COUNT];
BytesWritable[] byteValues = new BytesWritable[COUNT];
String[] words = new String[128];
// 128 random hex words form the vocabulary for the string column
for(int i=0; i < words.length; ++i) {
words[i] = Integer.toHexString(rand.nextInt());
// ints and strings are generated in pairs: rows 2i and 2i+1 share the same value
for(int i=0; i < COUNT/2; ++i) {
intValues[2*i] = rand.nextLong();
intValues[2*i+1] = intValues[2*i];
stringValues[2*i] = words[rand.nextInt(words.length)];
stringValues[2*i+1] = stringValues[2*i];
// doubles and 20-byte binary values are generated per row
for(int i=0; i < COUNT; ++i) {
doubleValues[i] = rand.nextDouble();
byte[] buf = new byte[20];
byteValues[i] = new BytesWritable(buf);
// append all rows, handling the batch each time it reaches 1024 rows and once
// more for any remainder
for(int i=0; i < COUNT; ++i) {
appendRandomRow(batch, intValues, doubleValues, stringValues,
byteValues, words, i);
if (batch.size == 1024) {
if (batch.size != 0) {
Reader reader = OrcFile.createReader(testFilePath,
Assert.assertEquals(COUNT, reader.getNumberOfRows());
RecordReader rows = reader.rows();
// get the row index
InStream.StreamOptions options = InStream.options();
if (reader.getCompressionKind() != CompressionKind.NONE) {
DataReader meta = RecordReaderUtils.createDefaultDataReader(
StripePlanner planner = new StripePlanner(schema, new ReaderEncryption(),
meta, reader.getWriterVersion(), true, Integer.MAX_VALUE);
// select every column when parsing the first stripe
boolean[] columns = new boolean[schema.getMaximumId() + 1];
Arrays.fill(columns, true);
OrcIndex index = planner.parseStripe(reader.getStripes().get(0), columns)
.readRowIndex(null, null);
// check the primitive columns to make sure they have the right number of
// items in the first row group
for(int c=1; c < 9; ++c) {
OrcProto.RowIndex colIndex = index.getRowGroupIndex()[c];
batch = reader.getSchema().createRowBatch();
// walk the rows from last to first; nextRowInBatch counts down inside the
// current batch and goes negative when a new batch is needed
int nextRowInBatch = -1;
for(int i=COUNT-1; i >= 0; --i, --nextRowInBatch) {
// if we have consumed the previous batch read a new one
if (nextRowInBatch < 0) {
// start of the chunk that ends at row i (at most 1024 rows, never below 0)
long base = Math.max(i - 1023, 0);
Assert.assertEquals("row " + i, true, rows.nextBatch(batch));
nextRowInBatch = batch.size - 1;
checkRandomRow(batch, intValues, doubleValues,
stringValues, byteValues, words, i, nextRowInBatch);
Iterator<StripeInformation> stripeIterator =
long offsetOfStripe2 = 0;
long offsetOfStripe4 = 0;
long lastRowOfStripe2 = 0;
// Walk the first five stripes: record the byte offsets of stripes 2 and 4 and
// accumulate the global index of the last row of stripe 2.
for(int i = 0; i < 5; ++i) {
// NOTE(review): the initializer is missing here; presumably stripeIterator.next() - confirm.
StripeInformation stripe =;
if (i < 2) {
lastRowOfStripe2 += stripe.getNumberOfRows();
} else if (i == 2) {
offsetOfStripe2 = stripe.getOffset();
lastRowOfStripe2 += stripe.getNumberOfRows() - 1;
} else if (i == 4) {
offsetOfStripe4 = stripe.getOffset();
// now read only two of the columns
Arrays.fill(columns, false);
columns[5] = true; // long column
columns[9] = true; // text column
// restrict the reader to the bytes from the start of stripe 2 up to stripe 4
rows = reader.rows(reader.options()
.range(offsetOfStripe2, offsetOfStripe4 - offsetOfStripe2)
// we only want two rows
batch = reader.getSchema().createRowBatch(2);
// the first batch is expected to hold exactly one row: the last row of stripe 2
Assert.assertEquals(true, rows.nextBatch(batch));
assertEquals(1, batch.size);
assertEquals(intValues[(int) lastRowOfStripe2], getLong(batch, 0));
assertEquals(stringValues[(int) lastRowOfStripe2],
getText(batch, 0).toString());
// the next batch starts with the row that follows it
Assert.assertEquals(true, rows.nextBatch(batch));
assertEquals(intValues[(int) lastRowOfStripe2 + 1], getLong(batch, 0));
assertEquals(stringValues[(int) lastRowOfStripe2 + 1],
getText(batch, 0).toString());
private void appendRandomRow(VectorizedRowBatch batch,
long[] intValues, double[] doubleValues,
String[] stringValues,
BytesWritable[] byteValues,
String[] words, int i) {
InnerStruct inner = new InnerStruct((int) intValues[i], stringValues[i]);
InnerStruct inner2 = new InnerStruct((int) (intValues[i] >> 32),
words[i % words.length] + "-x");
setBigRow(batch, batch.size++, (intValues[i] & 1) == 0, (byte) intValues[i],
(short) intValues[i], (int) intValues[i], intValues[i],
(float) doubleValues[i], doubleValues[i], byteValues[i], stringValues[i],
new MiddleStruct(inner, inner2), list(), map(inner, inner2));
private void checkRandomRow(VectorizedRowBatch batch,
long[] intValues, double[] doubleValues,
String[] stringValues,
BytesWritable[] byteValues,
String[] words, int i, int rowInBatch) {
InnerStruct inner = new InnerStruct((int) intValues[i], stringValues[i]);
InnerStruct inner2 = new InnerStruct((int) (intValues[i] >> 32),
words[i % words.length] + "-x");
checkBigRow(batch, rowInBatch, i, (intValues[i] & 1) == 0, (byte) intValues[i],
(short) intValues[i], (int) intValues[i], intValues[i],
(float) doubleValues[i], doubleValues[i], byteValues[i], stringValues[i],
new MiddleStruct(inner, inner2), list(), map(inner, inner2));
// Checks that a writer sharing a MemoryManagerImpl pool with nine other registered
// writers gets a scaled-down allocation and produces stripes smaller than the pool.
public void testMemoryManagement() throws Exception {
OrcConf.ROWS_BETWEEN_CHECKS.setLong(conf, 100);
final long POOL_SIZE = 50_000;
TypeDescription schema = createInnerSchema();
MemoryManagerImpl memoryMgr = new MemoryManagerImpl(POOL_SIZE);
// set up 10 files that all request the full size.
// (nine placeholder registrations here plus the real writer created below)
MemoryManager.Callback ignore = newScale -> false;
for(int f=0; f < 9; ++f) {
memoryMgr.addWriter(new Path("file-" + f), POOL_SIZE, ignore);
Writer writer = OrcFile.createWriter(testFilePath,
// check to make sure it is 10%
assertEquals(0.1, memoryMgr.getAllocationScale(), 0.001);
VectorizedRowBatch batch = schema.createRowBatch();
// rows are written one at a time
batch.size = 1;
for(int i=0; i < 2500; ++i) {
((LongColumnVector) batch.cols[0]).vector[0] = i * 300;
((BytesColumnVector) batch.cols[1]).setVal(0,
// 0.111 ~= 1/9: presumably the real writer has been removed from the manager by
// this point, leaving the nine placeholders - confirm.
assertEquals(0.111, memoryMgr.getAllocationScale(), 0.001);
Reader reader = OrcFile.createReader(testFilePath,
// count the stripes and make sure each one stayed below the pool size
int i = 0;
for(StripeInformation stripe: reader.getStripes()) {
i += 1;
assertTrue("stripe " + i + " is too long at " + stripe.getDataLength(),
stripe.getDataLength() < POOL_SIZE);
// 0.11 always uses the dictionary, so ends up with a lot more stripes
assertEquals(fileFormat == OrcFile.Version.V_0_11 ? 25 : 3, i);
assertEquals(2500, reader.getNumberOfRows());
// Writes 3500 rows with int1 = 300 * i and checks which rows the reader returns
// for three different search arguments on int1.
public void testPredicatePushdown() throws Exception {
TypeDescription schema = createInnerSchema();
Writer writer = OrcFile.createWriter(testFilePath,
VectorizedRowBatch batch = schema.createRowBatch();
batch.size = 3500;
for(int i=0; i < 3500; ++i) {
((LongColumnVector) batch.cols[0]).vector[i] = i * 300;
((BytesColumnVector) batch.cols[1]).setVal(i,
Reader reader = OrcFile.createReader(testFilePath,
assertEquals(3500, reader.getNumberOfRows());
// bounds 300000 and 600000 correspond to rows 1000 and 2000 (int1 = 300 * i)
SearchArgument sarg = SearchArgumentFactory.newBuilder()
.lessThan("int1", PredicateLeaf.Type.LONG, 300000L)
.lessThan("int1", PredicateLeaf.Type.LONG, 600000L)
RecordReader rows = reader.rows(reader.options()
.range(0L, Long.MAX_VALUE)
.include(new boolean[]{true, true, true})
.searchArgument(sarg, new String[]{null, "int1", "string1"}));
batch = reader.getSchema().createRowBatch(2000);
LongColumnVector ints = (LongColumnVector) batch.cols[0];
BytesColumnVector strs = (BytesColumnVector) batch.cols[1];
// the reader is expected to be positioned at row 1000 and to return exactly
// rows 1000..1999 even though the batch could hold 2000
Assert.assertEquals(1000L, rows.getRowNumber());
Assert.assertEquals(true, rows.nextBatch(batch));
assertEquals(1000, batch.size);
for(int i=1000; i < 2000; ++i) {
assertEquals(300 * i, ints.vector[i - 1000]);
assertEquals(Integer.toHexString(10*i), strs.toString(i - 1000));
// nothing after row 1999 qualifies; the row number ends at the total row count
Assert.assertEquals(false, rows.nextBatch(batch));
Assert.assertEquals(3500, rows.getRowNumber());
// look through the file with no rows selected
sarg = SearchArgumentFactory.newBuilder()
.lessThan("int1", PredicateLeaf.Type.LONG, 0L)
rows = reader.rows(reader.options()
.range(0L, Long.MAX_VALUE)
.include(new boolean[]{true, true, true})
.searchArgument(sarg, new String[]{null, "int1", "string1"}));
// with nothing selected the reader reports the end-of-file row number right away
Assert.assertEquals(3500L, rows.getRowNumber());
// select first 100 and last 100 rows
sarg = SearchArgumentFactory.newBuilder()
.lessThan("int1", PredicateLeaf.Type.LONG, 300L * 100)
.lessThan("int1", PredicateLeaf.Type.LONG, 300L * 3400)
rows = reader.rows(reader.options()
.range(0L, Long.MAX_VALUE)
.include(new boolean[]{true, true, true})
.searchArgument(sarg, new String[]{null, "int1", "string1"}));
// the assertions expect whole blocks back: rows 0..999 first, then rows 3000..3499
// (presumably the row groups containing the selected rows - confirm)
Assert.assertEquals(0, rows.getRowNumber());
Assert.assertEquals(true, rows.nextBatch(batch));
assertEquals(1000, batch.size);
Assert.assertEquals(3000, rows.getRowNumber());
for(int i=0; i < 1000; ++i) {
assertEquals(300 * i, ints.vector[i]);
assertEquals(Integer.toHexString(10*i), strs.toString(i));
Assert.assertEquals(true, rows.nextBatch(batch));
assertEquals(500, batch.size);
Assert.assertEquals(3500, rows.getRowNumber());
for(int i=3000; i < 3500; ++i) {
assertEquals(300 * i, ints.vector[i - 3000]);
assertEquals(Integer.toHexString(10*i), strs.toString(i - 3000));
Assert.assertEquals(false, rows.nextBatch(batch));
Assert.assertEquals(3500, rows.getRowNumber());
* Test all of the types that have distinct ORC writers using the vectorized
* writer with different combinations of repeating and null values.
* @throws Exception
// Writes four 1024-row batches over a 16-column schema (repeating nulls, repeating
// values, non-repeating nulls, non-repeating values), then checks the file
// statistics and reads the four batches back.
public void testRepeating() throws Exception {
// create a row type with each type that has a unique writer
// really just folds short, int, and long together
TypeDescription schema = TypeDescription.createStruct()
.addField("bin", TypeDescription.createBinary())
.addField("bool", TypeDescription.createBoolean())
.addField("byte", TypeDescription.createByte())
.addField("long", TypeDescription.createLong())
.addField("float", TypeDescription.createFloat())
.addField("double", TypeDescription.createDouble())
.addField("date", TypeDescription.createDate())
.addField("time", TypeDescription.createTimestamp())
.addField("dec", TypeDescription.createDecimal()
.addField("string", TypeDescription.createString())
.addField("char", TypeDescription.createChar().withMaxLength(10))
.addField("vc", TypeDescription.createVarchar().withMaxLength(10))
.addField("struct", TypeDescription.createStruct()
.addField("sub1", TypeDescription.createInt()))
.addField("union", TypeDescription.createUnion()
.addField("list", TypeDescription
VectorizedRowBatch batch = schema.createRowBatch();
Writer writer = OrcFile.createWriter(testFilePath,
// write 1024 repeating nulls
batch.size = 1024;
for(int c = 0; c < batch.cols.length; ++c) {
batch.cols[c].noNulls = false;
batch.cols[c].isNull[0] = true;
// write 1024 repeating non-null
for(int c =0; c < batch.cols.length; ++c) {
batch.cols[c].isNull[0] = false;
// only slot 0 of each column is assigned for this batch
((BytesColumnVector) batch.cols[0]).setVal(0, "Horton".getBytes(StandardCharsets.UTF_8));
((LongColumnVector) batch.cols[1]).vector[0] = 1;
// 130 is outside the signed byte range; the read side expects -126
((LongColumnVector) batch.cols[2]).vector[0] = 130;
((LongColumnVector) batch.cols[3]).vector[0] = 0x123456789abcdef0L;
((DoubleColumnVector) batch.cols[4]).vector[0] = 1.125;
((DoubleColumnVector) batch.cols[5]).vector[0] = 0.0009765625;
((LongColumnVector) batch.cols[6]).vector[0] =
new DateWritable(new Date(111, 6, 1)).getDays();
((TimestampColumnVector) batch.cols[7]).set(0,
new Timestamp(115, 9, 23, 10, 11, 59,
((DecimalColumnVector) batch.cols[8]).vector[0] =
new HiveDecimalWritable("1.234567");
((BytesColumnVector) batch.cols[9]).setVal(0, "Echelon".getBytes(StandardCharsets.UTF_8));
((BytesColumnVector) batch.cols[10]).setVal(0, "Juggernaut".getBytes(StandardCharsets.UTF_8));
// 11 characters into a varchar(10); the read side expects "Dreadnaugh"
((BytesColumnVector) batch.cols[11]).setVal(0, "Dreadnaught".getBytes(StandardCharsets.UTF_8));
((LongColumnVector) ((StructColumnVector) batch.cols[12]).fields[0])
.vector[0] = 123;
((UnionColumnVector) batch.cols[13]).tags[0] = 1;
((LongColumnVector) ((UnionColumnVector) batch.cols[13]).fields[1])
.vector[0] = 1234;
// list of three elements, all equal because the child vector is repeating
((ListColumnVector) batch.cols[14]).offsets[0] = 0;
((ListColumnVector) batch.cols[14]).lengths[0] = 3;
((ListColumnVector) batch.cols[14]).child.isRepeating = true;
((LongColumnVector) ((ListColumnVector) batch.cols[14]).child).vector[0]
= 31415;
// map of three distinct keys that share one repeating value
((MapColumnVector) batch.cols[15]).offsets[0] = 0;
((MapColumnVector) batch.cols[15]).lengths[0] = 3;
((MapColumnVector) batch.cols[15]).values.isRepeating = true;
((BytesColumnVector) ((MapColumnVector) batch.cols[15]).keys)
.setVal(0, "ORC".getBytes(StandardCharsets.UTF_8));
((BytesColumnVector) ((MapColumnVector) batch.cols[15]).keys)
.setVal(1, "Hive".getBytes(StandardCharsets.UTF_8));
((BytesColumnVector) ((MapColumnVector) batch.cols[15]).keys)
.setVal(2, "LLAP".getBytes(StandardCharsets.UTF_8));
((BytesColumnVector) ((MapColumnVector) batch.cols[15]).values)
.setVal(0, "fast".getBytes(StandardCharsets.UTF_8));
// write 1024 null without repeat
for(int c = 0; c < batch.cols.length; ++c) {
batch.cols[c].noNulls = false;
Arrays.fill(batch.cols[c].isNull, true);
// add 1024 rows of non-null, non-repeating
batch.size = 1024;
// three child entries per row for the list and map columns
((ListColumnVector) batch.cols[14]).child.ensureSize(3 * 1024, false);
((MapColumnVector) batch.cols[15]).keys.ensureSize(3 * 1024, false);
((MapColumnVector) batch.cols[15]).values.ensureSize(3 * 1024, false);
for(int r=0; r < 1024; ++r) {
((BytesColumnVector) batch.cols[0]).setVal(r,
((LongColumnVector) batch.cols[1]).vector[r] = r % 2;
((LongColumnVector) batch.cols[2]).vector[r] = (r % 255);
((LongColumnVector) batch.cols[3]).vector[r] = 31415L * r;
((DoubleColumnVector) batch.cols[4]).vector[r] = 1.125 * r;
((DoubleColumnVector) batch.cols[5]).vector[r] = 0.0009765625 * r;
((LongColumnVector) batch.cols[6]).vector[r] =
new DateWritable(new Date(111, 6, 1)).getDays() + r;
Timestamp ts = new Timestamp(115, 9, 25, 10, 11, 59 + r, 999999999);
((TimestampColumnVector) batch.cols[7]).set(r, ts);
((DecimalColumnVector) batch.cols[8]).vector[r] =
new HiveDecimalWritable("1.234567");
((BytesColumnVector) batch.cols[9]).setVal(r,
((BytesColumnVector) batch.cols[10]).setVal(r,
((BytesColumnVector) batch.cols[11]).setVal(r,
Integer.toHexString(r * 128).getBytes(StandardCharsets.UTF_8));
((LongColumnVector) ((StructColumnVector) batch.cols[12]).fields[0])
.vector[r] = r + 13;
((UnionColumnVector) batch.cols[13]).tags[r] = 1;
((LongColumnVector) ((UnionColumnVector) batch.cols[13]).fields[1])
.vector[r] = r + 42;
((ListColumnVector) batch.cols[14]).offsets[r] = 3 * r;
((ListColumnVector) batch.cols[14]).lengths[r] = 3;
for(int i=0; i < 3; ++i) {
((LongColumnVector) ((ListColumnVector) batch.cols[14]).child)
.vector[3 * r + i] = 31415 + i;
((MapColumnVector) batch.cols[15]).offsets[r] = 3 * r;
((MapColumnVector) batch.cols[15]).lengths[r] = 3;
for(int i=0; i < 3; ++i) {
((BytesColumnVector) ((MapColumnVector) batch.cols[15]).keys)
.setVal(3 * r + i, Integer.toHexString(3 * r + i).getBytes(StandardCharsets.UTF_8));
((BytesColumnVector) ((MapColumnVector) batch.cols[15]).values)
.setVal(3 * r + i, Integer.toString(3 * r + i).getBytes(StandardCharsets.UTF_8));
Reader reader = OrcFile.createReader(testFilePath,
// check the stats
ColumnStatistics[] stats = reader.getStatistics();
assertArrayEquals(stats, writer.getStatistics());
// 4 batches x 1024 rows at the root; each column is non-null in two of the four batches
assertEquals(4096, stats[0].getNumberOfValues());
assertEquals(false, stats[0].hasNull());
for(TypeDescription colType: schema.getChildren()) {
assertEquals("count on " + colType.getId(),
2048, stats[colType.getId()].getNumberOfValues());
assertEquals("hasNull on " + colType.getId(),
true, stats[colType.getId()].hasNull());
// statistics are indexed by column id, so top-level field n is stats[n + 1] here
assertEquals(8944, ((BinaryColumnStatistics) stats[1]).getSum());
assertEquals(1536, ((BooleanColumnStatistics) stats[2]).getTrueCount());
assertEquals(512, ((BooleanColumnStatistics) stats[2]).getFalseCount());
assertEquals(false, ((IntegerColumnStatistics) stats[4]).isSumDefined());
assertEquals(0, ((IntegerColumnStatistics) stats[4]).getMinimum());
((IntegerColumnStatistics) stats[4]).getMaximum());
assertEquals("0", ((StringColumnStatistics) stats[10]).getMinimum());
assertEquals("Echelon", ((StringColumnStatistics) stats[10]).getMaximum());
assertEquals(10154, ((StringColumnStatistics) stats[10]).getSum());
assertEquals("0 ",
((StringColumnStatistics) stats[11]).getMinimum());
assertEquals("ff ",
((StringColumnStatistics) stats[11]).getMaximum());
assertEquals(20480, ((StringColumnStatistics) stats[11]).getSum());
((StringColumnStatistics) stats[12]).getMinimum());
((StringColumnStatistics) stats[12]).getMaximum());
assertEquals(14813, ((StringColumnStatistics) stats[12]).getSum());
RecordReader rows = reader.rows();
batch = reader.getSchema().createRowBatch(1024);
// typed views over the columns of the read batch
BytesColumnVector bins = (BytesColumnVector) batch.cols[0];
LongColumnVector bools = (LongColumnVector) batch.cols[1];
LongColumnVector bytes = (LongColumnVector) batch.cols[2];
LongColumnVector longs = (LongColumnVector) batch.cols[3];
DoubleColumnVector floats = (DoubleColumnVector) batch.cols[4];
DoubleColumnVector doubles = (DoubleColumnVector) batch.cols[5];
LongColumnVector dates = (LongColumnVector) batch.cols[6];
TimestampColumnVector times = (TimestampColumnVector) batch.cols[7];
DecimalColumnVector decs = (DecimalColumnVector) batch.cols[8];
BytesColumnVector strs = (BytesColumnVector) batch.cols[9];
BytesColumnVector chars = (BytesColumnVector) batch.cols[10];
BytesColumnVector vcs = (BytesColumnVector) batch.cols[11];
StructColumnVector structs = (StructColumnVector) batch.cols[12];
UnionColumnVector unions = (UnionColumnVector) batch.cols[13];
ListColumnVector lists = (ListColumnVector) batch.cols[14];
MapColumnVector maps = (MapColumnVector) batch.cols[15];
LongColumnVector structInts = (LongColumnVector) structs.fields[0];
LongColumnVector unionInts = (LongColumnVector) unions.fields[1];
LongColumnVector listInts = (LongColumnVector) lists.child;
BytesColumnVector mapKeys = (BytesColumnVector) maps.keys;
BytesColumnVector mapValues = (BytesColumnVector) maps.values;
Assert.assertEquals(true, rows.nextBatch(batch));
assertEquals(1024, batch.size);
// read the 1024 nulls
for(int f=0; f < batch.cols.length; ++f) {
assertEquals("field " + f,
true, batch.cols[f].isRepeating);
assertEquals("field " + f,
false, batch.cols[f].noNulls);
assertEquals("field " + f,
true, batch.cols[f].isNull[0]);
// read the 1024 repeat values
Assert.assertEquals(true, rows.nextBatch(batch));
assertEquals(1024, batch.size);
for(int r=0; r < 1024; ++r) {
assertEquals("row " + r, "Horton", bins.toString(r));
assertEquals("row " + r, 1, bools.vector[r]);
assertEquals("row " + r, -126, bytes.vector[r]);
assertEquals("row " + r, 1311768467463790320L, longs.vector[r]);
assertEquals("row " + r, 1.125, floats.vector[r], 0.00001);
assertEquals("row " + r, 9.765625E-4, doubles.vector[r], 0.000001);
assertEquals("row " + r, "2011-07-01",
new DateWritable((int) dates.vector[r]).toString());
assertEquals("row " + r, "2015-10-23 10:11:59.999999999",
assertEquals("row " + r, "1.234567", decs.vector[r].toString());
assertEquals("row " + r, "Echelon", strs.toString(r));
assertEquals("row " + r, "Juggernaut", chars.toString(r));
assertEquals("row " + r, "Dreadnaugh", vcs.toString(r));
assertEquals("row " + r, 123, structInts.vector[r]);
assertEquals("row " + r, 1, unions.tags[r]);
assertEquals("row " + r, 1234, unionInts.vector[r]);
assertEquals("row " + r, 3, lists.lengths[r]);
assertEquals("row " + r, true, listInts.isRepeating);
assertEquals("row " + r, 31415, listInts.vector[0]);
assertEquals("row " + r, 3, maps.lengths[r]);
assertEquals("row " + r, "ORC", mapKeys.toString((int) maps.offsets[r]));
assertEquals("row " + r, "Hive", mapKeys.toString((int) maps.offsets[r] + 1));
assertEquals("row " + r, "LLAP", mapKeys.toString((int) maps.offsets[r] + 2));
assertEquals("row " + r, "fast", mapValues.toString((int) maps.offsets[r]));
assertEquals("row " + r, "fast", mapValues.toString((int) maps.offsets[r] + 1));
assertEquals("row " + r, "fast", mapValues.toString((int) maps.offsets[r] + 2));
// read the second set of 1024 nulls
Assert.assertEquals(true, rows.nextBatch(batch));
assertEquals(1024, batch.size);
for(int f=0; f < batch.cols.length; ++f) {
assertEquals("field " + f,
true, batch.cols[f].isRepeating);
assertEquals("field " + f,
false, batch.cols[f].noNulls);
assertEquals("field " + f,
true, batch.cols[f].isNull[0]);
// read the 1024 non-repeating, non-null rows
Assert.assertEquals(true, rows.nextBatch(batch));
assertEquals(1024, batch.size);
for(int r=0; r < 1024; ++r) {
String hex = Integer.toHexString(r);
assertEquals("row " + r, hex, bins.toString(r));
assertEquals("row " + r, r % 2 == 1 ? 1 : 0, bools.vector[r]);
assertEquals("row " + r, (byte) (r % 255), bytes.vector[r]);
assertEquals("row " + r, 31415L * r, longs.vector[r]);
assertEquals("row " + r, 1.125F * r, floats.vector[r], 0.0001);
assertEquals("row " + r, 0.0009765625 * r, doubles.vector[r], 0.000001);
assertEquals("row " + r, new DateWritable(new Date(111, 6, 1 + r)),
new DateWritable((int) dates.vector[r]));
assertEquals("row " + r,
new Timestamp(115, 9, 25, 10, 11, 59 + r, 999999999),
assertEquals("row " + r, "1.234567", decs.vector[r].toString());
assertEquals("row " + r, Integer.toString(r), strs.toString(r));
assertEquals("row " + r, Integer.toHexString(r), chars.toString(r));
assertEquals("row " + r, Integer.toHexString(r * 128), vcs.toString(r));
assertEquals("row " + r, r + 13, structInts.vector[r]);
assertEquals("row " + r, 1, unions.tags[r]);
assertEquals("row " + r, r + 42, unionInts.vector[r]);
assertEquals("row " + r, 3, lists.lengths[r]);
assertEquals("row " + r, 31415, listInts.vector[(int) lists.offsets[r]]);
assertEquals("row " + r, 31416, listInts.vector[(int) lists.offsets[r] + 1]);
assertEquals("row " + r, 31417, listInts.vector[(int) lists.offsets[r] + 2]);
// verify this row's map length (index r), not one fixed slot on every iteration
assertEquals("row " + r, 3, maps.lengths[r]);
assertEquals("row " + r, Integer.toHexString(3 * r), mapKeys.toString((int) maps.offsets[r]));
assertEquals("row " + r, Integer.toString(3 * r), mapValues.toString((int) maps.offsets[r]));
assertEquals("row " + r, Integer.toHexString(3 * r + 1), mapKeys.toString((int) maps.offsets[r] + 1));
assertEquals("row " + r, Integer.toString(3 * r + 1), mapValues.toString((int) maps.offsets[r] + 1));
assertEquals("row " + r, Integer.toHexString(3 * r + 2), mapKeys.toString((int) maps.offsets[r] + 2));
assertEquals("row " + r, Integer.toString(3 * r + 2), mapValues.toString((int) maps.offsets[r] + 2));
// should have no more rows
Assert.assertEquals(false, rows.nextBatch(batch));
private static String makeString(BytesColumnVector vector, int row) {
if (vector.isRepeating) {
row = 0;
if (vector.noNulls || !vector.isNull[row]) {
return new String(vector.vector[row], vector.start[row],
vector.length[row], StandardCharsets.UTF_8);
} else {
return null;
* Test the char and varchar padding and truncation.
* @throws Exception
public void testStringPadding() throws Exception {
// Round-trips four values through a char(10) and a varchar(10) column and
// checks what comes back for every cell.
TypeDescription schema = TypeDescription.createStruct()
.addField("char", TypeDescription.createChar().withMaxLength(10))
.addField("varchar", TypeDescription.createVarchar().withMaxLength(10));
Writer writer = OrcFile.createWriter(testFilePath,
VectorizedRowBatch batch = schema.createRowBatch();
batch.size = 4;
// store the same inputs in both columns: empty, shorter than the maximum
// length, exactly the maximum length, and a fourth value
// NOTE(review): the literal for row 3 is not visible in this excerpt
for(int c=0; c < batch.cols.length; ++c) {
((BytesColumnVector) batch.cols[c]).setVal(0, "".getBytes(StandardCharsets.UTF_8));
((BytesColumnVector) batch.cols[c]).setVal(1, "xyz".getBytes(StandardCharsets.UTF_8));
((BytesColumnVector) batch.cols[c]).setVal(2, "0123456789".getBytes(StandardCharsets.UTF_8));
((BytesColumnVector) batch.cols[c]).setVal(3,
Reader reader = OrcFile.createReader(testFilePath,
RecordReader rows = reader.rows();
batch = reader.getSchema().createRowBatch();
Assert.assertEquals(true, rows.nextBatch(batch));
assertEquals(4, batch.size);
// ORC currently trims the output strings. See HIVE-12286
// rows 0-3 of the char column (cols[0])
makeString((BytesColumnVector) batch.cols[0], 0));
makeString((BytesColumnVector) batch.cols[0], 1));
makeString((BytesColumnVector) batch.cols[0], 2));
makeString((BytesColumnVector) batch.cols[0], 3));
// rows 0-3 of the varchar column (cols[1])
makeString((BytesColumnVector) batch.cols[1], 0));
makeString((BytesColumnVector) batch.cols[1], 1));
makeString((BytesColumnVector) batch.cols[1], 2));
makeString((BytesColumnVector) batch.cols[1], 3));
* A test case that tests the case where you add a repeating batch
* to a column that isn't using dictionary encoding.
* @throws Exception
public void testNonDictionaryRepeatingString() throws Exception {
// Skipped when the file format under test is 0.11.
Assume.assumeTrue(fileFormat != OrcFile.Version.V_0_11);
TypeDescription schema = TypeDescription.createStruct()
.addField("str", TypeDescription.createString());
Writer writer = OrcFile.createWriter(testFilePath,
VectorizedRowBatch batch = schema.createRowBatch();
batch.size = 1024;
// first batch: 1024 different strings (the decimal form of r * 10001)
for(int r=0; r < batch.size; ++r) {
((BytesColumnVector) batch.cols[0]).setVal(r,
Integer.toString(r * 10001).getBytes(StandardCharsets.UTF_8));
// second batch: flag the column as repeating and put the one value in slot 0
batch.cols[0].isRepeating = true;
((BytesColumnVector) batch.cols[0]).setVal(0, "Halloween".getBytes(StandardCharsets.UTF_8));
Reader reader = OrcFile.createReader(testFilePath,
RecordReader rows = reader.rows();
batch = reader.getSchema().createRowBatch();
// the first batch read back must hold the 1024 distinct strings
Assert.assertEquals(true, rows.nextBatch(batch));
assertEquals(1024, batch.size);
for(int r=0; r < 1024; ++r) {
assertEquals(Integer.toString(r * 10001),
makeString((BytesColumnVector) batch.cols[0], r));
// the second batch read back is also 1024 rows
Assert.assertEquals(true, rows.nextBatch(batch));
assertEquals(1024, batch.size);
for(int r=0; r < 1024; ++r) {
makeString((BytesColumnVector) batch.cols[0], r));
// nothing is left after the two batches
Assert.assertEquals(false, rows.nextBatch(batch));
public void testStructs() throws Exception {
// Round-trips a struct column holding a single long field, with the outer
// struct set to null for three ranges of rows.
TypeDescription schema = TypeDescription.createStruct()
.addField("struct", TypeDescription.createStruct()
.addField("inner", TypeDescription.createLong()));
Writer writer = OrcFile.createWriter(testFilePath,
VectorizedRowBatch batch = schema.createRowBatch();
batch.size = 1024;
StructColumnVector outer = (StructColumnVector) batch.cols[0];
outer.noNulls = false;
for(int r=0; r < 1024; ++r) {
// rows [0,200), [400,600) and [800,1024) get a null struct
if (r < 200 || (r >= 400 && r < 600) || r >= 800) {
outer.isNull[r] = true;
// the inner long is the row number
((LongColumnVector) outer.fields[0]).vector[r] = r;
Reader reader = OrcFile.createReader(testFilePath,
RecordReader rows = reader.rows();
batch = reader.getSchema().createRowBatch();
assertEquals(1024, batch.size);
// note that "inner" here is the struct column read back from the file
StructColumnVector inner = (StructColumnVector) batch.cols[0];
LongColumnVector vec = (LongColumnVector) inner.fields[0];
for(int r=0; r < 1024; ++r) {
// the same ranges that were written as null must read back as null
if (r < 200 || (r >= 400 && r < 600) || r >= 800) {
assertEquals("row " + r, true, inner.isNull[r]);
} else {
assertEquals("row " + r, false, inner.isNull[r]);
assertEquals("row " + r, r, vec.vector[r]);
assertEquals(0, batch.size);
* Test Unions.
* @throws Exception
public void testUnions() throws Exception {
// Round-trips a union column whose rows alternate between null ranges and
// ranges using tag 0 or tag 1.
TypeDescription schema = TypeDescription.createStruct()
.addField("outer", TypeDescription.createUnion()
Writer writer = OrcFile.createWriter(testFilePath,
VectorizedRowBatch batch = schema.createRowBatch();
batch.size = 1024;
UnionColumnVector outer = (UnionColumnVector) batch.cols[0];
batch.cols[0].noNulls = false;
// layout written below:
//   [0,200) null, [200,300) tag 0, [300,400) tag 1, [400,600) null,
//   [600,800) tag 1, [800,1000) null, [1000,1024) tag 1
for(int r=0; r < 1024; ++r) {
if (r < 200) {
outer.isNull[r] = true;
} else if (r < 300) {
outer.tags[r] = 0;
} else if (r < 400) {
outer.tags[r] = 1;
} else if (r < 600) {
outer.isNull[r] = true;
} else if (r < 800) {
outer.tags[r] = 1;
} else if (r < 1000) {
outer.isNull[r] = true;
} else {
outer.tags[r] = 1;
// child 0 holds r and child 1 holds -r
((LongColumnVector) outer.fields[0]).vector[r] = r;
((LongColumnVector) outer.fields[1]).vector[r] = -r;
Reader reader = OrcFile.createReader(testFilePath,
RecordReader rows = reader.rows();
batch = reader.getSchema().createRowBatch(1024);
UnionColumnVector union = (UnionColumnVector) batch.cols[0];
LongColumnVector ints = (LongColumnVector) union.fields[0];
LongColumnVector longs = (LongColumnVector) union.fields[1];
Assert.assertEquals(true, rows.nextBatch(batch));
assertEquals(1024, batch.size);
// check the same layout on the way back; only the child selected by the
// tag is compared for non-null rows
for(int r=0; r < 1024; ++r) {
if (r < 200) {
assertEquals("row " + r, true, union.isNull[r]);
} else if (r < 300) {
assertEquals("row " + r, false, union.isNull[r]);
assertEquals("row " + r, 0, union.tags[r]);
assertEquals("row " + r, r, ints.vector[r]);
} else if (r < 400) {
assertEquals("row " + r, false, union.isNull[r]);
assertEquals("row " + r, 1, union.tags[r]);
assertEquals("row " + r, -r, longs.vector[r]);
} else if (r < 600) {
assertEquals("row " + r, true, union.isNull[r]);
} else if (r < 800) {
assertEquals("row " + r, false, union.isNull[r]);
assertEquals("row " + r, 1, union.tags[r]);
assertEquals("row " + r, -r, longs.vector[r]);
} else if (r < 1000) {
assertEquals("row " + r, true, union.isNull[r]);
} else {
assertEquals("row " + r, false, union.isNull[r]);
assertEquals("row " + r, 1, union.tags[r]);
assertEquals("row " + r, -r, longs.vector[r]);
// the single batch is the whole file
Assert.assertEquals(false, rows.nextBatch(batch));
* Test lists and how they interact with the child column. In particular,
* put nulls between back to back lists and then make some lists that
* overlap.
* @throws Exception
public void testLists() throws Exception {
// Round-trips a list column whose rows mix nulls, one-element lists and
// two-element lists that share child entries with their neighbours.
TypeDescription schema = TypeDescription.createStruct()
Writer writer = OrcFile.createWriter(testFilePath,
VectorizedRowBatch batch = schema.createRowBatch();
batch.size = 1024;
ListColumnVector list = (ListColumnVector) batch.cols[0];
list.noNulls = false;
// layout written below:
//   [0,200)   null
//   [200,300) one element at child index r - 200 (children 0..99)
//   [300,400) null
//   [400,500) one element at child index r - 300 (children 100..199)
//   [500,600) null
//   [600,700) two elements starting at child index r, so row r and
//             row r + 1 both reference child r + 1
//   [700,1024) null
for(int r=0; r < 1024; ++r) {
if (r < 200) {
list.isNull[r] = true;
} else if (r < 300) {
list.offsets[r] = r - 200;
list.lengths[r] = 1;
} else if (r < 400) {
list.isNull[r] = true;
} else if (r < 500) {
list.offsets[r] = r - 300;
list.lengths[r] = 1;
} else if (r < 600) {
list.isNull[r] = true;
} else if (r < 700) {
list.offsets[r] = r;
list.lengths[r] = 2;
} else {
list.isNull[r] = true;
// child entry i holds i * 10
((LongColumnVector) list.child).vector[r] = r * 10;
Reader reader = OrcFile.createReader(testFilePath,
RecordReader rows = reader.rows();
batch = reader.getSchema().createRowBatch(1024);
list = (ListColumnVector) batch.cols[0];
assertEquals(1024, batch.size);
// compare the stringified form of every row against the values that the
// written offsets pointed at
for(int r=0; r < 1024; ++r) {
StringBuilder actual = new StringBuilder();
list.stringifyValue(actual, r);
if (r < 200) {
assertEquals("row " + r, "null", actual.toString());
} else if (r < 300) {
assertEquals("row " + r, "[" + ((r - 200) * 10) + "]",
} else if (r < 400) {
assertEquals("row " + r, "null", actual.toString());
} else if (r < 500) {
assertEquals("row " + r, "[" + ((r - 300) * 10) + "]",
} else if (r < 600) {
assertEquals("row " + r, "null", actual.toString());
} else if (r < 700) {
assertEquals("row " + r, "[" + (10 * r) + ", " + (10 * (r + 1)) + "]",
} else {
assertEquals("row " + r, "null", actual.toString());
Assert.assertEquals(false, rows.nextBatch(batch));
* Test maps and how they interact with the child column. In particular,
* put nulls between back to back maps and then make some maps that
* overlap.
* @throws Exception
public void testMaps() throws Exception {
// Round-trips a map column whose rows mix nulls, one-entry maps and
// two-entry maps that share key/value entries with their neighbours.
TypeDescription schema = TypeDescription.createStruct()
Writer writer = OrcFile.createWriter(testFilePath,
VectorizedRowBatch batch = schema.createRowBatch();
batch.size = 1024;
MapColumnVector map = (MapColumnVector) batch.cols[0];
map.noNulls = false;
// layout written below:
//   [0,200)   null
//   [200,300) one entry at index r - 200 (entries 0..99)
//   [300,400) null
//   [400,500) one entry at index r - 300 (entries 100..199)
//   [500,600) null
//   [600,700) two entries starting at index r, so row r and row r + 1
//             both reference entry r + 1
//   [700,1024) null
for(int r=0; r < 1024; ++r) {
if (r < 200) {
map.isNull[r] = true;
} else if (r < 300) {
map.offsets[r] = r - 200;
map.lengths[r] = 1;
} else if (r < 400) {
map.isNull[r] = true;
} else if (r < 500) {
map.offsets[r] = r - 300;
map.lengths[r] = 1;
} else if (r < 600) {
map.isNull[r] = true;
} else if (r < 700) {
map.offsets[r] = r;
map.lengths[r] = 2;
} else {
map.isNull[r] = true;
// entry i has key i and value i * 10
((LongColumnVector) map.keys).vector[r] = r;
((LongColumnVector) map.values).vector[r] = r * 10;
Reader reader = OrcFile.createReader(testFilePath,
RecordReader rows = reader.rows();
batch = reader.getSchema().createRowBatch();
map = (MapColumnVector) batch.cols[0];
assertEquals(1024, batch.size);
// compare the stringified form of every row against the entries that the
// written offsets pointed at
for(int r=0; r < 1024; ++r) {
StringBuilder buffer = new StringBuilder();
map.stringifyValue(buffer, r);
String actual = buffer.toString();
if (r < 200) {
assertEquals("row " + r, "null", actual);
} else if (r < 300) {
assertEquals("row " + r, "[{\"key\": " + (r - 200) +
", \"value\": " + ((r - 200) * 10) + "}]",
} else if (r < 400) {
assertEquals("row " + r, "null", actual);
} else if (r < 500) {
assertEquals("row " + r, "[{\"key\": " + (r - 300) +
", \"value\": " + ((r - 300) * 10) + "}]", actual);
} else if (r < 600) {
assertEquals("row " + r, "null", actual);
} else if (r < 700) {
assertEquals("row " + r, "[{\"key\": " + r + ", \"value\": " + (r * 10)
+ "}, {\"key\": " + (r + 1) + ", \"value\": " + (10 * (r + 1))
+ "}]", actual);
} else {
assertEquals("row " + r, "null", actual);
assertEquals(0, batch.size);
public void testExpansion() throws Exception {
// Writes a two-row batch whose list children hold thousands of entries,
// then reads the file back and expects the two rows in a single batch.
TypeDescription schema =
"struct<list1:array<string>," +
Writer writer = OrcFile.createWriter(testFilePath,
VectorizedRowBatch batch = schema.createRowBatch();
batch.size = 2;
// grow both child vectors to 6000 slots; the false argument asks not to
// preserve existing data
ListColumnVector list1 = (ListColumnVector) batch.cols[0];
BytesColumnVector str = (BytesColumnVector) list1.child;
str.ensureSize(6000, false);
ListColumnVector list2 = (ListColumnVector) batch.cols[1];
BytesColumnVector bin = (BytesColumnVector) list2.child;
bin.ensureSize(6000, false);
// list1 row 0 covers children [0,2000); list2 row 1 covers [2000,5000)
list1.offsets[0] = 0;
list1.lengths[0] = 2000;
list2.offsets[1] = 2000;
list2.lengths[1] = 3000;
// both children get the hex form of the index in slots 0..4999
for(int v=0; v < 5000; ++v) {
byte[] bytes = Long.toHexString(v).getBytes(StandardCharsets.UTF_8);
str.setVal(v, bytes);
bin.setVal(v, bytes);
Reader reader = OrcFile.createReader(testFilePath,
RecordReader rows = reader.rows();
batch = reader.getSchema().createRowBatch();
assertEquals(2, batch.size);
public void testWriterVersion() throws Exception {
// Only runs when the file format under test is 0.11.
// NOTE(review): the expected values paired with the calls below are not
// visible in this excerpt.
Assume.assumeTrue(fileFormat == OrcFile.Version.V_0_11);
// test writer implementation serialization
// test writer version serialization
// each call maps a (writer implementation, version id) pair to a
// WriterVersion
OrcFile.WriterVersion.from(OrcFile.WriterImplementation.ORC_JAVA, 99));
OrcFile.WriterVersion.from(OrcFile.WriterImplementation.ORC_JAVA, 0));
OrcFile.WriterVersion.from(OrcFile.WriterImplementation.ORC_JAVA, 2));
OrcFile.WriterVersion.from(OrcFile.WriterImplementation.ORC_CPP, 99));
OrcFile.WriterVersion.from(OrcFile.WriterImplementation.ORC_CPP, 6));
OrcFile.WriterVersion.from(OrcFile.WriterImplementation.PRESTO, 6));
OrcFile.WriterVersion.from(OrcFile.WriterImplementation.UNKNOWN, 0));
// test compatibility
public void testBadPrestoVersion() {
// Only runs when the file format under test is 0.11.
Assume.assumeTrue(fileFormat == OrcFile.Version.V_0_11);
// NOTE(review): judging by the test name, version id 0 is presumably not a
// valid PRESTO writer version and this call is expected to throw - confirm
// against the test annotation, which is not visible here.
OrcFile.WriterVersion.from(OrcFile.WriterImplementation.PRESTO, 0);
* Test whether the file versions are translated correctly
* @throws Exception
public void testFileVersion() throws Exception {
// Only runs when the file format under test is 0.11.
Assume.assumeTrue(fileFormat == OrcFile.Version.V_0_11);
// a missing or empty version list is treated as 0.11
assertEquals(OrcFile.Version.V_0_11, ReaderImpl.getFileVersion(null));
assertEquals(OrcFile.Version.V_0_11, ReaderImpl.getFileVersion(new ArrayList<Integer>()));
// explicit version lists: 0.11, 0.12 and 9999.0
// NOTE(review): the expected values for these three are not visible here
ReaderImpl.getFileVersion(Arrays.asList(new Integer[]{0, 11})));
ReaderImpl.getFileVersion(Arrays.asList(new Integer[]{0, 12})));
ReaderImpl.getFileVersion(Arrays.asList(new Integer[]{9999, 0})));
public void testMergeUnderstood() throws Exception {
// Only runs when the file format under test is 0.11.
// Checks OrcFile.understandFormat against three mocked readers.
// NOTE(review): the stubbing of each mock is not visible in this excerpt;
// the variable names suggest a future file version, a future writer
// version and the current versions respectively.
Assume.assumeTrue(fileFormat == OrcFile.Version.V_0_11);
Path p = new Path("test.orc");
Reader futureVersion = Mockito.mock(Reader.class);
assertEquals(false, OrcFile.understandFormat(p, futureVersion));
Reader futureWriter = Mockito.mock(Reader.class);
assertEquals(false, OrcFile.understandFormat(p, futureWriter));
Reader current = Mockito.mock(Reader.class);
assertEquals(true, OrcFile.understandFormat(p, current));
static ByteBuffer fromString(String s) {
return ByteBuffer.wrap(s.getBytes(StandardCharsets.UTF_8));
static byte[] fromLong(long x) {
return Long.toHexString(x).getBytes(StandardCharsets.UTF_8);
public void testMerge() throws Exception {
// Writes several small files, merges them with OrcFile.mergeFiles and
// checks both the list of merged paths and the properties of the output.
// start from a clean slate for the three mergeable inputs
Path input1 = new Path(workDir, "TestVectorOrcFile.testMerge1-" +
fileFormat.getName() + ".orc");
fs.delete(input1, false);
Path input2 = new Path(workDir, "TestVectorOrcFile.testMerge2-" +
fileFormat.getName() + ".orc");
fs.delete(input2, false);
Path input3 = new Path(workDir, "TestVectorOrcFile.testMerge3-" +
fileFormat.getName() + ".orc");
fs.delete(input3, false);
TypeDescription schema = TypeDescription.fromString("struct<a:int,b:string>");
// change all of the options away from default to find anything we
// don't copy to the merged file
OrcFile.WriterOptions opts = OrcFile.writerOptions(conf)
// input1: a = r, b = hex(r); user metadata a=foo, b=bar
Writer writer = OrcFile.createWriter(input1, opts);
VectorizedRowBatch batch = schema.createRowBatch();
batch.size = 1024;
for(int r=0; r < 1024; ++r) {
((LongColumnVector) batch.cols[0]).vector[r] = r;
((BytesColumnVector) batch.cols[1]).setVal(r, fromLong(r));
writer.addUserMetadata("a", fromString("foo"));
writer.addUserMetadata("b", fromString("bar"));
// increase the buffer size to 30k
// input2: a = 2r, b = hex(2r); user metadata a=foo, c=baz
writer = OrcFile.createWriter(input2, opts);
batch.size = 1024;
for(int r=0; r < 1024; ++r) {
((LongColumnVector) batch.cols[0]).vector[r] = 2 * r;
((BytesColumnVector) batch.cols[1]).setVal(r, fromLong(2 * r));
writer.addUserMetadata("a", fromString("foo"));
writer.addUserMetadata("c", fromString("baz"));
// decrease the buffer size to 10k
// input3: a = 3r, b = hex(3r); user metadata c=baz, d=bat
writer = OrcFile.createWriter(input3, opts);
batch.size = 1024;
for(int r=0; r < 1024; ++r) {
((LongColumnVector) batch.cols[0]).vector[r] = 3 * r;
((BytesColumnVector) batch.cols[1]).setVal(r, fromLong(3 * r));
writer.addUserMetadata("c", fromString("baz"));
writer.addUserMetadata("d", fromString("bat"));
// first merge: all three inputs are expected to be accepted
Path output1 = new Path(workDir, "TestVectorOrcFile.testMerge.out1-" +
fileFormat.getName() + ".orc");
fs.delete(output1, false);
List<Path> paths = OrcFile.mergeFiles(output1,
OrcFile.writerOptions(conf), Arrays.asList(input1, input2, input3));
assertEquals(3, paths.size());
Reader reader = OrcFile.createReader(output1, OrcFile.readerOptions(conf));
assertEquals(3 * 1024, reader.getNumberOfRows());
assertEquals(CompressionKind.LZO, reader.getCompressionKind());
// the merged file reports the 30k compression size
assertEquals(30 * 1024, reader.getCompressionSize());
assertEquals(1000, reader.getRowIndexStride());
assertEquals(fileFormat, reader.getFileVersion());
assertEquals(OrcFile.WriterVersion.HIVE_8732, reader.getWriterVersion());
assertEquals(3, reader.getStripes().size());
// the user metadata of the inputs ends up as the four keys a, b, c, d
assertEquals(4, reader.getMetadataKeys().size());
assertEquals(fromString("foo"), reader.getMetadataValue("a"));
assertEquals(fromString("bar"), reader.getMetadataValue("b"));
assertEquals(fromString("baz"), reader.getMetadataValue("c"));
assertEquals(fromString("bat"), reader.getMetadataValue("d"));
// input4 uses a different schema (only column a)
TypeDescription schema4 = TypeDescription.fromString("struct<a:int>");
Path input4 = new Path(workDir, "TestVectorOrcFile.testMerge4-" +
fileFormat.getName() + ".orc");
fs.delete(input4, false);
writer = OrcFile.createWriter(input4, opts);
batch = schema4.createRowBatch();
batch.size = 1024;
for(int r=0; r < 1024; ++r) {
((LongColumnVector) batch.cols[0]).vector[r] = 4 * r;
// input5 goes back to the original two-column schema
// NOTE(review): column a uses 4 * r here while column b uses 5 * r; the
// row values of the second merge are not asserted in the visible code, so
// this may just be a copy-paste leftover - confirm.
Path input5 = new Path(workDir, "TestVectorOrcFile.testMerge5-" +
fileFormat.getName() + ".orc");
fs.delete(input5, false);
writer = OrcFile.createWriter(input5, opts);
batch = schema.createRowBatch();
batch.size = 1024;
for(int r=0; r < 1024; ++r) {
((LongColumnVector) batch.cols[0]).vector[r] = 4 * r;
((BytesColumnVector) batch.cols[1]).setVal(r, fromLong(5 * r));
// second merge: only two of the four inputs are expected to be accepted
Path output2 = new Path(workDir, "TestVectorOrcFile.testMerge.out2-" +
fileFormat.getName() + ".orc");
fs.delete(output2, false);
paths = OrcFile.mergeFiles(output2, OrcFile.writerOptions(conf),
Arrays.asList(input3, input4, input1, input5));
assertEquals(2, paths.size());
reader = OrcFile.createReader(output2, OrcFile.readerOptions(conf));
assertEquals(2 * 1024, reader.getNumberOfRows());
assertEquals(CompressionKind.LZO, reader.getCompressionKind());
assertEquals(20 * 1024, reader.getCompressionSize());
assertEquals(1000, reader.getRowIndexStride());
assertEquals(fileFormat, reader.getFileVersion());
assertEquals(OrcFile.WriterVersion.HIVE_8732, reader.getWriterVersion());
assertEquals(2, reader.getStripes().size());
assertEquals(4, reader.getMetadataKeys().size());
assertEquals(fromString("foo"), reader.getMetadataValue("a"));
assertEquals(fromString("bar"), reader.getMetadataValue("b"));
assertEquals(fromString("baz"), reader.getMetadataValue("c"));
assertEquals(fromString("bat"), reader.getMetadataValue("d"));
* Write a mergeable file to test merging files with column encryption.
* @param path the path to write to
* @param provider the key provider
* @param startValue the base value for the columns
* @param stripes the number of stripes to write
* @param bufferSize the buffer size to use for the compression
* @param encrypt the encryption string
* @param mask the mask string
* @return the locations of the intermediate stripes
* @throws IOException
private long[] writeMergeableFile(Path path,
KeyProvider provider,
long startValue,
int stripes,
int bufferSize,
String encrypt,
String mask) throws IOException {
// remove any file left over from an earlier run
fs.delete(path, false);
TypeDescription schema = TypeDescription.fromString(
// change all of the options away from default to find anything we
// don't copy to the merged file
// NOTE(review): provider, bufferSize, encrypt and mask are not referenced
// in the lines visible here; presumably they feed the option chain below.
OrcFile.WriterOptions opts = OrcFile.writerOptions(conf)
long[] intermediateFooters = new long[stripes];
Writer writer = OrcFile.createWriter(path, opts);
VectorizedRowBatch batch = schema.createRowBatch();
// column 0 is a long; column 1 is a struct with two bytes fields
LongColumnVector a = (LongColumnVector) batch.cols[0];
StructColumnVector b = (StructColumnVector) batch.cols[1];
BytesColumnVector c = (BytesColumnVector) b.fields[0];
BytesColumnVector d = (BytesColumnVector) b.fields[1];
batch.size = 1024;
// NOTE(review): the loop bound is the literal 3 while intermediateFooters is
// sized by stripes; every caller visible in this file passes 3, but any
// smaller value would overflow the array - confirm whether this should be
// btch < stripes.
for(int btch=0; btch < 3; ++btch) {
for (int r = 0; r < 1024; ++r) {
// values run consecutively from startValue across the batches
long value = startValue + btch * 1024 + r;
a.vector[r] = value;
c.setVal(r, fromLong(value));
d.setVal(r, String.format("%010x", value * 1_000_001)
// write an intermediate footer to force a stripe
intermediateFooters[btch] = writer.writeIntermediateFooter();
return intermediateFooters;
static String computeSha(String value) {
try {
MessageDigest md = MessageDigest.getInstance("SHA-256");
byte[] digest = md.digest(value.getBytes(StandardCharsets.UTF_8));
return DatatypeConverter.printHexBinary(digest);
} catch (NoSuchAlgorithmException e) {
throw new RuntimeException(e);
public void testEncryptMerge() throws Exception {
// Merges files written with column encryption and checks the merged file
// both without and with the keys. Skipped when the file format is 0.11.
Assume.assumeTrue(fileFormat != OrcFile.Version.V_0_11);
Path input1 = new Path(workDir, "TestVectorOrcFile.testEncryptMerge1-" +
fileFormat.getName() + ".orc");
Path input2 = new Path(workDir, "TestVectorOrcFile.testEncryptMerge2-" +
fileFormat.getName() + ".orc");
Path input3 = new Path(workDir, "TestVectorOrcFile.testEncryptMerge3-" +
fileFormat.getName() + ".orc");
Path input4 = new Path(workDir, "TestVectorOrcFile.testEncryptMerge4-" +
fileFormat.getName() + ".orc");
Path input5 = new Path(workDir, "TestVectorOrcFile.testEncryptMerge5-" +
fileFormat.getName() + ".orc");
// fixed seed for the keystore's random source
Random random = new Random(169);
InMemoryKeystore keystore = new InMemoryKeystore(random);
EncryptionAlgorithm algorithm = EncryptionAlgorithm.AES_CTR_128;
// both keys are left as freshly allocated (all zero) byte arrays
byte[] piiKey = new byte[algorithm.keyLength()];
byte[] topSecretKey = new byte[algorithm.keyLength()];
keystore.addKey("pii", algorithm, piiKey)
.addKey("top_secret", algorithm, topSecretKey);
// column a is encrypted with "pii" and column b with "top_secret"
String encryption = "pii:a;top_secret:b";
String mask = "sha256,`don't worry`:b";
// write three files that should merge, each with 3 stripes of 1024 rows.
long[] cuts = writeMergeableFile(input1, keystore, 0, 3, 0x400, encryption, mask);
writeMergeableFile(input2, keystore, 3 * 1024, 3, 0x800, encryption, mask);
writeMergeableFile(input3, keystore, 6 * 1024, 3, 0xc00, encryption, mask);
// two files that aren't mergeable
writeMergeableFile(input4, keystore, 9 * 1024, 3, 0x400, encryption, null);
writeMergeableFile(input5, keystore, 12 * 1024, 3, 0x400, null, null);
// make sure that we can read up to the intermediate footers
try (Reader reader = OrcFile.createReader(input1, OrcFile.readerOptions(conf)
.maxLength(cuts[0]))) {
assertEquals(1024, reader.getNumberOfRows());
try (Reader reader = OrcFile.createReader(input1, OrcFile.readerOptions(conf)
.maxLength(cuts[1]))) {
assertEquals(2 * 1024, reader.getNumberOfRows());
try (Reader reader = OrcFile.createReader(input1, OrcFile.readerOptions(conf)
.maxLength(cuts[2]))) {
assertEquals(3 * 1024, reader.getNumberOfRows());
// make a new version of the pii key
keystore.addKey("pii", 1, algorithm, new byte[algorithm.keyLength()]);
Path merge1 = new Path(workDir, "TestVectorOrcFile.testEncryptMerge.merge1-" +
fileFormat.getName() + ".orc");
// merge all three files together
fs.delete(merge1, false);
List<Path> paths = OrcFile.mergeFiles(merge1,
Arrays.asList(input1, input2, input3));
assertEquals(3, paths.size());
// test reading with no keys
Reader reader = OrcFile.createReader(merge1, OrcFile.readerOptions(conf));
assertEquals(9 * 1024, reader.getNumberOfRows());
assertEquals(CompressionKind.ZLIB, reader.getCompressionKind());
assertEquals(1000, reader.getRowIndexStride());
// 0xc00 is the largest of the three input buffer sizes
assertEquals(0xc00, reader.getCompressionSize());
assertEquals(fileFormat, reader.getFileVersion());
assertEquals(9, reader.getStripes().size());
// both keys are listed at version 0 but reported as unavailable
EncryptionKey[] keys = reader.getColumnEncryptionKeys();
assertEquals(2, keys.length);
assertEquals("pii", keys[0].getKeyName());
assertEquals(0, keys[0].getKeyVersion());
assertEquals(false, keys[0].isAvailable());
assertEquals("top_secret", keys[1].getKeyName());
assertEquals(0, keys[1].getKeyVersion());
assertEquals(false, keys[1].isAvailable());
// check the file stats
// without keys, column 1 reports no values at all
ColumnStatistics[] stats = reader.getStatistics();
assertEquals(9 * 1024, stats[0].getNumberOfValues());
assertEquals(0, stats[1].getNumberOfValues());
assertEquals(9 * 1024, stats[2].getNumberOfValues());
assertEquals(9 * 1024, stats[3].getNumberOfValues());
((StringColumnStatistics) stats[3]).getMinimum());
((StringColumnStatistics) stats[3]).getMaximum());
assertEquals(9 * 1024, stats[4].getNumberOfValues());
((StringColumnStatistics) stats[4]).getMinimum());
((StringColumnStatistics) stats[4]).getMaximum());
// check the stripe stats
List<StripeStatistics> stripeStats = reader.getStripeStatistics();
for(int s=0; s < stripeStats.size(); ++s) {
ColumnStatistics[] cs = stripeStats.get(s).getColumnStatistics();
assertEquals("stripe " + s, 1024, cs[0].getNumberOfValues());
assertEquals("stripe " + s, 0, cs[1].getNumberOfValues());
assertEquals("stripe " + s, 1024, cs[2].getNumberOfValues());
assertEquals("stripe " + s, 1024, cs[3].getNumberOfValues());
// the string min/max are 64 characters long, the length of a SHA-256 hex
// digest
assertEquals("stripe " + s, 64, ((StringColumnStatistics) cs[3]).getMinimum().length());
assertEquals("stripe " + s, 64, ((StringColumnStatistics) cs[3]).getMaximum().length());
assertEquals("stripe " + s, 1024, cs[4].getNumberOfValues());
assertEquals("stripe " + s, 64, ((StringColumnStatistics) cs[4]).getMinimum().length());
assertEquals("stripe " + s, 64, ((StringColumnStatistics) cs[4]).getMaximum().length());
// check the file contents
RecordReader rows = reader.rows();
VectorizedRowBatch batch = reader.getSchema().createRowBatchV2();
LongColumnVector a = (LongColumnVector) batch.cols[0];
StructColumnVector b = (StructColumnVector) batch.cols[1];
BytesColumnVector c = (BytesColumnVector) b.fields[0];
BytesColumnVector d = (BytesColumnVector) b.fields[1];
// without keys, a reads as null and the strings read as the SHA-256 of
// the values that were written
for(int btch=0; btch < 9; ++btch) {
assertEquals(true, rows.nextBatch(batch));
assertEquals(1024, batch.size);
for(int r=0; r < batch.size; ++r) {
long value = btch * 1024 + r;
assertEquals("batch " + btch + " row " + r, true, a.isNull[r]);
assertEquals("batch " + btch + " row " + r,
computeSha(Long.toHexString(value)), c.toString(r));
assertEquals("batch " + btch + " row " + r,
computeSha(String.format("%010x", value * 1_000_001)),
assertEquals(false, rows.nextBatch(batch));
// test reading with keys
reader = OrcFile.createReader(merge1,
assertEquals(9 * 1024, reader.getNumberOfRows());
// the key version is still 0, even though a version 1 of "pii" was added
// to the keystore before the merge
keys = reader.getColumnEncryptionKeys();
assertEquals(2, keys.length);
assertEquals("pii", keys[0].getKeyName());
assertEquals(0, keys[0].getKeyVersion());
assertEquals(true, keys[0].isAvailable());
assertEquals("top_secret", keys[1].getKeyName());
assertEquals(0, keys[1].getKeyVersion());
assertEquals(true, keys[1].isAvailable());
// check the file stats
stats = reader.getStatistics();
assertEquals(9 * 1024, stats[0].getNumberOfValues());
assertEquals(9 * 1024, stats[1].getNumberOfValues());
assertEquals(0, ((IntegerColumnStatistics) stats[1]).getMinimum());
assertEquals(9 * 1024 - 1, ((IntegerColumnStatistics) stats[1]).getMaximum());
assertEquals(9 * 1024, stats[2].getNumberOfValues());
assertEquals(9 * 1024, stats[3].getNumberOfValues());
// string statistics compare text, so "fff" is the maximum even though the
// four digit values are numerically larger
assertEquals("0", ((StringColumnStatistics) stats[3]).getMinimum());
assertEquals("fff", ((StringColumnStatistics) stats[3]).getMaximum());
assertEquals(9 * 1024, stats[4].getNumberOfValues());
assertEquals("0000000000", ((StringColumnStatistics) stats[4]).getMinimum());
assertEquals("022541e1bf", ((StringColumnStatistics) stats[4]).getMaximum());
// check the stripe stats
stripeStats = reader.getStripeStatistics();
for(int s=0; s < stripeStats.size(); ++s) {
// stripe s holds the values [s * 1024, s * 1024 + 1023]
long low = s * 1024;
long high = s * 1024 + 1023;
ColumnStatistics[] cs = stripeStats.get(s).getColumnStatistics();
assertEquals("stripe " + s, 1024, cs[0].getNumberOfValues());
assertEquals("stripe " + s, 1024, cs[1].getNumberOfValues());
assertEquals("stripe " + s, low, ((IntegerColumnStatistics) cs[1]).getMinimum());
assertEquals("stripe " + s, high, ((IntegerColumnStatistics) cs[1]).getMaximum());
assertEquals("stripe " + s, 1024, cs[2].getNumberOfValues());
assertEquals("stripe " + s, 1024, cs[3].getNumberOfValues());
assertEquals("stripe " + s, Long.toHexString(low),
((StringColumnStatistics) cs[3]).getMinimum());
// stripe 0 mixes one, two and three digit hex strings, and as text "ff"
// sorts after "3ff"; in later stripes all strings have the same length
assertEquals("stripe " + s, s == 0 ? "ff" : Long.toHexString(high),
((StringColumnStatistics) cs[3]).getMaximum());
assertEquals("stripe " + s, 1024, cs[4].getNumberOfValues());
assertEquals("stripe " + s, String.format("%010x", 1_000_001 * low),
((StringColumnStatistics) cs[4]).getMinimum());
assertEquals("stripe " + s, String.format("%010x", 1_000_001 * high),
((StringColumnStatistics) cs[4]).getMaximum());
// check the file contents
// with keys, the original values come back
rows = reader.rows();
for(int btch=0; btch < 9; ++btch) {
assertEquals(true, rows.nextBatch(batch));
assertEquals(1024, batch.size);
for(int r=0; r < batch.size; ++r) {
long value = btch * 1024 + r;
assertEquals("batch " + btch + " row " + r, value, a.vector[r]);
assertEquals("batch " + btch + " row " + r, Long.toHexString(value),
assertEquals("batch " + btch + " row " + r,
String.format("%010x", value * 1_000_001), d.toString(r));
assertEquals(false, rows.nextBatch(batch));
// second merge: mix mergeable and non-mergeable inputs
Path merge2 = new Path(workDir, "TestVectorOrcFile.testEncryptMerge.merge2-" +
fileFormat.getName() + ".orc");
fs.delete(merge2, false);
paths = OrcFile.mergeFiles(merge2,
Arrays.asList(input2, input4, input1, input5));
// make sure only input1 & input2 were merged
assertEquals(2, paths.size());
assertEquals(true, paths.contains(input1));
assertEquals(true, paths.contains(input2));
reader = OrcFile.createReader(merge2, OrcFile.readerOptions(conf));
assertEquals(2 * 3 * 1024, reader.getNumberOfRows());
assertEquals(CompressionKind.ZLIB, reader.getCompressionKind());
// 0x800 is the larger of the buffer sizes of input1 and input2
assertEquals(0x800, reader.getCompressionSize());
assertEquals(1000, reader.getRowIndexStride());
assertEquals(fileFormat, reader.getFileVersion());
assertEquals(6, reader.getStripes().size());
assertEquals(2, reader.getColumnEncryptionKeys().length);
assertEquals(2, reader.getDataMasks().length);
assertEquals(2, reader.getEncryptionVariants().length);
// Directory that the example ORC files are loaded from; taken from the
// "example.dir" system property.
// NOTE(review): the fallback value is not visible in this excerpt.
Path exampleDir = new Path(System.getProperty("example.dir",
public void testZeroByteOrcFile() throws Exception {
// Opens the example file zero.orc and checks that the reader reports an
// empty file with default properties.
// we only have to run this test once, since it is a 0 byte file.
Assume.assumeTrue(fileFormat == OrcFile.Version.V_0_11);
Path zeroFile = new Path(exampleDir, "zero.orc");
Reader reader = OrcFile.createReader(zeroFile, OrcFile.readerOptions(conf));
assertEquals(0, reader.getNumberOfRows());
assertEquals("struct<>", reader.getSchema().toString());
assertEquals(CompressionKind.NONE, reader.getCompressionKind());
assertEquals(0, reader.getRawDataSize());
assertEquals(0, reader.getRowIndexStride());
assertEquals(0, reader.getCompressionSize());
assertEquals(0, reader.getMetadataSize());
// the versions reported are the current ones
assertEquals(OrcFile.Version.CURRENT, reader.getFileVersion());
assertEquals(0, reader.getStripes().size());
assertEquals(0, reader.getStatistics().length);
assertEquals(0, reader.getMetadataKeys().size());
assertEquals(OrcFile.CURRENT_WRITER, reader.getWriterVersion());
// reading rows yields no batch
VectorizedRowBatch batch =
assertEquals(false, reader.rows().nextBatch(batch));
public void testFutureOrcFile() throws Exception {
// Opening the example file version1999.orc must fail with an
// UnknownFormatException that describes the file.
// Only runs when the file format under test is 0.11.
Assume.assumeTrue(fileFormat == OrcFile.Version.V_0_11);
Path zeroFile = new Path(exampleDir, "version1999.orc");
try {
OrcFile.createReader(zeroFile, OrcFile.readerOptions(conf));
// reaching this line means no exception was thrown, so fail the test
assertTrue("no exception for bad version", false);
} catch (UnknownFormatException uf) {
// the exception carries the path, the version string and the postscript
assertEquals("path is correct", "version1999.orc", uf.getPath().getName());
assertEquals("19.99", uf.getVersionString());
OrcProto.PostScript ps = uf.getPostscript();
assertEquals("ORC", ps.getMagic());
assertEquals(OrcProto.CompressionKind.NONE, ps.getCompression());
public void testEmptyDoubleStream() throws Exception {
// Writes two rows whose list columns are all empty, so the child columns
// receive no values, and checks that the lists read back with length 0.
TypeDescription schema =
TypeDescription.fromString("struct<list1:array<double>," +
Writer writer = OrcFile.createWriter(testFilePath,
VectorizedRowBatch batch = schema.createRowBatch();
batch.size = 2;
ListColumnVector list1 = (ListColumnVector) batch.cols[0];
ListColumnVector list2 = (ListColumnVector) batch.cols[1];
// every row is a zero-length list at offset 0
for(int r=0; r < batch.size; ++r) {
list1.offsets[r] = 0;
list1.lengths[r] = 0;
list2.offsets[r] = 0;
list2.lengths[r] = 0;
Reader reader = OrcFile.createReader(testFilePath,
RecordReader rows = reader.rows();
batch = reader.getSchema().createRowBatch();
assertEquals(2, batch.size);
list1 = (ListColumnVector) batch.cols[0];
list2 = (ListColumnVector) batch.cols[1];
for(int r=0; r < batch.size; ++r) {
assertEquals(0, list1.lengths[r]);
assertEquals(0, list2.lengths[r]);
/**
 * Exercises search-argument pushdown on a field nested inside a struct
 * ("complex.int2").
 *
 * A 3500-row batch is filled so that row i holds i in column 0, i * 300 in
 * the first struct field, and a string in the second struct field (the
 * assertions below expect Integer.toHexString(10 * i)). Three readers are
 * then opened with different search arguments and the test checks which row
 * ranges come back and what getRowNumber() reports along the way.
 */
// NOTE(review): parts of this method are not visible in this chunk - the
// writer options, the end of the setVal call, the remaining SearchArgument
// builder calls (for example any startAnd/startNot/end/build), the write and
// close calls, and the closing braces. The descriptions below rely on the
// visible assertions only.
public void testPredicatePushdownForComplex() throws Exception {
TypeDescription schema = createComplexInnerSchema();
Writer writer = OrcFile.createWriter(testFilePath,
VectorizedRowBatch batch = schema.createRowBatch();
batch.size = 3500;
for(int i=0; i < 3500; ++i) {
((LongColumnVector) batch.cols[0]).vector[i] = i;
((LongColumnVector)((StructColumnVector) batch.cols[1]).fields[0]).vector[i] = i * 300;
((BytesColumnVector)((StructColumnVector) batch.cols[1]).fields[1]).setVal(i,
Reader reader = OrcFile.createReader(testFilePath,
assertEquals(3500, reader.getNumberOfRows());
// First search argument: two bounds on complex.int2 (300000 and 600000).
// The assertions further down expect exactly rows 1000..1999 to be returned.
SearchArgument sarg = SearchArgumentFactory.newBuilder()
.lessThan("complex.int2", PredicateLeaf.Type.LONG, 300000L)
.lessThan("complex.int2", PredicateLeaf.Type.LONG, 600000L)
RecordReader rows = reader.rows(reader.options()
.range(0L, Long.MAX_VALUE)
.include(new boolean[]{true, true, true, true, true})
.searchArgument(sarg, new String[]{null, "int1", "complex","int2","string1"}));
// A 2000-row batch is used, yet only 1000 rows are expected per read below.
batch = reader.getSchema().createRowBatch(2000);
LongColumnVector ints1 = (LongColumnVector) batch.cols[0];
StructColumnVector struct1 = (StructColumnVector) batch.cols[1];
LongColumnVector ints2 = (LongColumnVector) struct1.fields[0];
BytesColumnVector strs = (BytesColumnVector) struct1.fields[1];
// Before any batch is read the reader is already positioned at row 1000.
Assert.assertEquals(1000L, rows.getRowNumber());
Assert.assertEquals(true, rows.nextBatch(batch));
assertEquals(1000, batch.size);
// Batch slot (i - 1000) must hold the values written for file row i.
for(int i=1000; i < 2000; ++i) {
assertEquals(300 * i, ints2.vector[i - 1000]);
assertEquals(Integer.toHexString(10*i), strs.toString(i - 1000));
// Nothing else matches; the row number then reports the end of the file.
Assert.assertEquals(false, rows.nextBatch(batch));
Assert.assertEquals(3500, rows.getRowNumber());
// look through the file with no rows selected
sarg = SearchArgumentFactory.newBuilder()
.lessThan("complex.int2", PredicateLeaf.Type.LONG, 0L)
rows = reader.rows(reader.options()
.range(0L, Long.MAX_VALUE)
.include(new boolean[]{true, true, true, true, true})
.searchArgument(sarg, new String[]{null, "int1",null,"int2","string1"}));
// With nothing selected the reader starts out at the end of the file.
Assert.assertEquals(3500L, rows.getRowNumber());
// select first 100 and last 100 rows
// NOTE(review): the assertions below expect 1000 rows (0..999) and then 500
// rows (3000..3499) rather than 100 + 100, which suggests rows are selected
// in groups of 1000 - the writer's row index stride is not visible here;
// confirm.
sarg = SearchArgumentFactory.newBuilder()
.lessThan("complex.int2", PredicateLeaf.Type.LONG, 300L * 100)
.lessThan("complex.int2", PredicateLeaf.Type.LONG, 300L * 3400)
rows = reader.rows(reader.options()
.range(0L, Long.MAX_VALUE)
.include(new boolean[]{true, true,true,true, true})
.searchArgument(sarg, new String[]{null, "int1",null, "int2","string1"}));
Assert.assertEquals(0, rows.getRowNumber());
Assert.assertEquals(true, rows.nextBatch(batch));
assertEquals(1000, batch.size);
// After the first 1000 rows the reader jumps straight to row 3000.
Assert.assertEquals(3000, rows.getRowNumber());
for(int i=0; i < 1000; ++i) {
assertEquals(300 * i, ints2.vector[i]);
assertEquals(Integer.toHexString(10*i), strs.toString(i));
Assert.assertEquals(true, rows.nextBatch(batch));
assertEquals(500, batch.size);
Assert.assertEquals(3500, rows.getRowNumber());
// Batch slot (i - 3000) must hold the values written for file row i.
for(int i=3000; i < 3500; ++i) {
assertEquals(300 * i, ints2.vector[i - 3000]);
assertEquals(Integer.toHexString(10*i), strs.toString(i - 3000));
Assert.assertEquals(false, rows.nextBatch(batch));
Assert.assertEquals(3500, rows.getRowNumber());
/**
 * Round-trips a 1000-row file with encrypted columns and checks what a reader
 * sees with and without the keys.
 *
 * Row r holds r * 3 in column "i" (batch column 0), r * 5 in column "norm"
 * (batch column 1) and a three-element string list "r.0", "r.1", "r.2" in
 * column "x" (batch column 2). Judging by the assertions, a reader without
 * keys gets empty statistics and repeating nulls for "i" and "x" while "norm"
 * stays readable; a reader with the keys gets full statistics and the
 * original values for every column.
 *
 * The test body only runs when fileFormat is not V_0_11.
 */
// NOTE(review): the schema definition, the writer and reader option chains,
// the calls that write the batch and read the next batch, and the closing
// braces are not visible in this chunk - confirm against the complete file.
public void testColumnEncryption() throws Exception {
Assume.assumeTrue(fileFormat != OrcFile.Version.V_0_11);
final int ROWS = 1000;
final int SEED = 2;
// Fixed seed; the Random is handed to the keystore below.
final Random random = new Random(SEED);
TypeDescription schema =
// NOTE(review): no visible line fills these arrays, so as shown both keys are
// all zero bytes - a fill from `random` may be missing from this chunk.
byte[] piiKey = new byte[16];
byte[] creditKey = new byte[32];
// Two named keys: "pii" as AES_CTR_128 and "credit" as AES_CTR_256.
InMemoryKeystore keys = new InMemoryKeystore(random)
.addKey("pii", EncryptionAlgorithm.AES_CTR_128, piiKey)
.addKey("credit", EncryptionAlgorithm.AES_CTR_256, creditKey);
Writer writer = OrcFile.createWriter(testFilePath,
VectorizedRowBatch batch = schema.createRowBatch();
batch.size = ROWS;
LongColumnVector i = (LongColumnVector) batch.cols[0];
LongColumnVector norm = (LongColumnVector) batch.cols[1];
ListColumnVector x = (ListColumnVector) batch.cols[2];
BytesColumnVector xElem = (BytesColumnVector) x.child;
// Three list elements per row, so the child vector needs 3 * ROWS slots.
xElem.ensureSize(3 * ROWS, false);
for(int r=0; r < ROWS; ++r) {
i.vector[r] = r * 3;
norm.vector[r] = r * 5;
// Append this row's three elements at the current end of the child vector.
int start = x.childCount;
x.offsets[r] = start;
x.lengths[r] = 3;
x.childCount += x.lengths[r];
for(int child=0; child < x.lengths[r]; ++child) {
xElem.setVal(start + child,
String.format("%d.%d", r, child).getBytes(StandardCharsets.UTF_8));
// Read without any keys
Reader reader = OrcFile.createReader(testFilePath,
.setKeyProvider(new InMemoryKeystore()));
ColumnStatistics[] stats = reader.getStatistics();
// Index 0 still counts every row; index 1 ("i") reports no values and has nulls.
assertEquals(ROWS, stats[0].getNumberOfValues());
assertEquals(0, stats[1].getNumberOfValues());
assertEquals(true, stats[1].hasNull());
// Index 2 ("norm") keeps its real statistics: ROWS values ranging 0 .. 5 * (ROWS - 1).
assertEquals(ROWS, stats[2].getNumberOfValues());
assertEquals(0, ((IntegerColumnStatistics) stats[2]).getMinimum());
assertEquals(ROWS * 5 - 5, ((IntegerColumnStatistics) stats[2]).getMaximum());
// Indexes 3 and 4 (the list "x" and its string elements) report no values.
assertEquals(0, stats[3].getNumberOfValues());
assertEquals(0, stats[4].getNumberOfValues());
RecordReader rows = reader.rows();
batch = reader.getSchema().createRowBatch();
i = (LongColumnVector) batch.cols[0];
norm = (LongColumnVector) batch.cols[1];
x = (ListColumnVector) batch.cols[2];
// ensure that we get the right number of rows with all nulls
// presumably a rows.nextBatch(batch) call precedes these assertions - TODO confirm
assertEquals(ROWS, batch.size);
assertEquals(true, i.isRepeating);
assertEquals(false, i.noNulls);
assertEquals(true, i.isNull[0]);
assertEquals(true, x.isRepeating);
assertEquals(false, x.noNulls);
assertEquals(true, x.isNull[0]);
// "norm" is still readable without keys.
for(int r=0; r < ROWS; ++r) {
assertEquals("row " + r, r * 5, norm.vector[r]);
// Add a new version of the pii key
// Version 1 of "pii" is registered with the same key bytes as before.
keys.addKey("pii", 1, EncryptionAlgorithm.AES_CTR_128, piiKey);
// Read with the keys
reader = OrcFile.createReader(testFilePath,
stats = reader.getStatistics();
// With the keys every column reports its real statistics.
assertEquals(ROWS, stats[0].getNumberOfValues());
assertEquals(ROWS, stats[1].getNumberOfValues());
assertEquals(0, ((IntegerColumnStatistics) stats[1]).getMinimum());
assertEquals(3 * (ROWS - 1), ((IntegerColumnStatistics) stats[1]).getMaximum());
assertEquals(0, ((IntegerColumnStatistics) stats[2]).getMinimum());
assertEquals(5 * (ROWS - 1), ((IntegerColumnStatistics) stats[2]).getMaximum());
assertEquals(ROWS, stats[3].getNumberOfValues());
assertEquals(3 * ROWS, stats[4].getNumberOfValues());
// String statistics order lexicographically, hence "999.2" as the maximum.
assertEquals("0.0", ((StringColumnStatistics)stats[4]).getMinimum());
assertEquals("999.2", ((StringColumnStatistics)stats[4]).getMaximum());
rows = reader.rows();
batch = reader.getSchema().createRowBatch();
i = (LongColumnVector) batch.cols[0];
norm = (LongColumnVector) batch.cols[1];
x = (ListColumnVector) batch.cols[2];
xElem = (BytesColumnVector) x.child;
// presumably a rows.nextBatch(batch) call precedes these assertions - TODO confirm
assertEquals(ROWS, batch.size);
assertEquals(false, i.isRepeating);
assertEquals(false, x.isRepeating);
assertEquals(false, xElem.isRepeating);
assertEquals(true, i.noNulls);
assertEquals(true, x.noNulls);
assertEquals(true, xElem.noNulls);
// Every row must match what was written, including the list layout.
for(int r=0; r < ROWS; ++r) {
assertEquals("row " + r, r * 3, i.vector[r]);
assertEquals("row " + r, r * 5, norm.vector[r]);
assertEquals("row " + r, r * 3, x.offsets[r]);
assertEquals("row " + r, 3, x.lengths[r]);
for(int child=0; child < x.lengths[r]; ++child) {
assertEquals("row " + r + "." + child, String.format("%d.%d", r, child),
xElem.toString((int) x.offsets[r] + child));
/**
 * Prepares 100 batches of 1000 repeating rows over six columns (dec, dt,
 * time, dbl, bool and a sixth column held in a BytesColumnVector), using six
 * keys named "key_0" .. "key_5". It then reads the file three ways and
 * delegates the checks to checkHasData: with an empty keystore (no column has
 * data), with all keys (every column has data), and with each key on its own
 * (only the column at the same index has data).
 *
 * The test body only runs when fileFormat is not V_0_11.
 */
// NOTE(review): the end of the schema string, the writer and reader option
// chains (including how keys are mapped to columns), the per-batch write call
// and the closing braces are not visible in this chunk - confirm against the
// complete file.
public void testMultiStripeColumnEncryption() throws Exception {
Assume.assumeTrue(fileFormat != OrcFile.Version.V_0_11);
final EncryptionAlgorithm algorithm = EncryptionAlgorithm.AES_CTR_128;
final int BATCHES = 100;
final int SEED = 3;
final Random random = new Random(SEED);
TypeDescription schema = TypeDescription.fromString(
"struct<dec:decimal(20,4)," +
"dt:date," +
"time:timestamp," +
"dbl:double," +
"bool:boolean," +
InMemoryKeystore allKeys = new InMemoryKeystore();
// One master key per column, each sized by the algorithm's key length.
// NOTE(review): no visible line fills these arrays - a fill from `random`
// may be missing from this chunk.
byte[][] masterKeys = new byte[6][];
for(int k=0; k < masterKeys.length; ++k) {
masterKeys[k] = new byte[algorithm.keyLength()];
allKeys.addKey("key_" + k, algorithm, masterKeys[k]);
Writer writer = OrcFile.createWriter(testFilePath,
// Set size to 1000 precisely so that stripes are exactly 5000 rows long.
VectorizedRowBatch batch = schema.createRowBatch(1000);
DecimalColumnVector dec = (DecimalColumnVector) batch.cols[0];
LongColumnVector dt = (LongColumnVector) batch.cols[1];
TimestampColumnVector time = (TimestampColumnVector) batch.cols[2];
DoubleColumnVector dbl = (DoubleColumnVector) batch.cols[3];
LongColumnVector bool = (LongColumnVector) batch.cols[4];
BytesColumnVector bin = (BytesColumnVector) batch.cols[5];
// Generate 100 batches of 1,000 rows each
batch.size = 1000;
// Every column repeats, so only slot 0 is set for each batch.
dec.isRepeating = true;
dt.isRepeating = true;
time.isRepeating = true;
dbl.isRepeating = true;
bool.isRepeating = true;
bin.isRepeating = true;
// Each value is derived from the batch number b; checkHasData rebuilds the
// expected values with the same expressions.
for(int b=0; b < BATCHES; ++b) {
dec.set(0, new HiveDecimalWritable(String.format("%d.%03d", b, b)));
// NOTE(review): month argument 12 - if this is the deprecated
// Date(year, month, day) constructor with 0-based months, it rolls over into
// January of the following year; confirm that this is intended.
dt.vector[0] = new DateWritable(new Date(96 + b, 12, 11)).getDays();
time.set(0, Timestamp.valueOf(String.format("2014-12-14 12:00:00.%04d", b)));
dbl.vector[0] = b + 0.5;
bool.vector[0] = b % 2;
bin.setVal(0, Integer.toString(b).getBytes(StandardCharsets.UTF_8));
// Read without any keys
Reader reader = OrcFile.createReader(testFilePath,
.setKeyProvider(new InMemoryKeystore()));
checkHasData(reader.rows(), batch, BATCHES,
false, false, false, false, false, false);
// read with all of the keys
reader = OrcFile.createReader(testFilePath,
checkHasData(reader.rows(), batch, BATCHES,
true, true, true, true, true, true);
// try enabling each key by itself
for(int c=0; c < 6; ++c) {
InMemoryKeystore single = new InMemoryKeystore();
single.addKey("key_" + c, algorithm, masterKeys[c]);
reader = OrcFile.createReader(testFilePath,
// Only the column whose key was supplied is expected to carry data.
boolean[] hasData = new boolean[6];
hasData[c] = true;
checkHasData(reader.rows(), batch, BATCHES, hasData);
private void checkHasData(RecordReader reader, VectorizedRowBatch batch,
int BATCHES, boolean... hasData) throws IOException {
for(int b=0; b < BATCHES; ++b) {
assertEquals("batch " + b, true, reader.nextBatch(batch));
for(int c=0; c < hasData.length; c++) {
if (hasData[c]) {
// the expected value
String expected = null;
// a function from the row to the value as a string
IntFunction<String> actual = row -> null;
switch (c) {
case 0:
expected = new HiveDecimalWritable(String.format("%d.%03d", b, b)).toString();
actual = row -> ((DecimalColumnVector) batch.cols[0]).vector[row].toString();
case 1:
expected = Long.toString(new DateWritable(new Date(96 + b, 12, 11)).getDays());
actual = row -> Long.toString(((LongColumnVector) batch.cols[1]).vector[row]);
case 2:
expected = Timestamp.valueOf(String.format("2014-12-14 12:00:00.%04d", b)).toString();
actual = row -> ((TimestampColumnVector) batch.cols[2]).asScratchTimestamp(row).toString();
case 3:
expected = Double.toString(b + 0.5);
actual = row -> Double.toString(((DoubleColumnVector) batch.cols[3]).vector[row]);
case 4:
expected = Long.toString(b % 2);
actual = row -> Long.toString(((LongColumnVector) batch.cols[4]).vector[row]);
expected = Integer.toString(b);
actual = row -> ((BytesColumnVector) batch.cols[5]).toString(row);
assertEquals("batch " + b + " column " + c, true, batch.cols[c].noNulls);
assertEquals("batch " + b + " column " + c + " row 0", expected, actual.apply(0));
// Not all of the readers set isRepeating, so if it isn't set, check the values.
if (!batch.cols[c].isRepeating) {
for(int r=1; r < batch.size; ++r) {
assertEquals("batch " + b + " column " + c + " row " + r, expected, actual.apply(r));
} else {
assertEquals("batch " + b + " column " + c, true, batch.cols[c].isRepeating);
assertEquals("batch " + b + " column " + c, true, batch.cols[c].isNull[0]);
assertEquals("end", false, reader.nextBatch(batch));