blob: b28169b2318f99ccbf6795ebb9a05cf6a3334d4d [file] [log] [blame]
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.ql.io.orc;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.common.type.HiveDecimal;
import org.apache.hadoop.hive.serde2.io.ByteWritable;
import org.apache.hadoop.hive.serde2.io.DoubleWritable;
import org.apache.hadoop.hive.serde2.io.ShortWritable;
import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.BinaryObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.BooleanObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.ByteObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.DoubleObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.FloatObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.IntObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.LongObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.ShortObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector;
import org.apache.hadoop.io.BooleanWritable;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.junit.Before;
import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.TestName;
import java.io.File;
import java.io.IOException;
import java.math.BigInteger;
import java.nio.ByteBuffer;
import java.sql.Timestamp;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Random;
import static junit.framework.Assert.*;
import static junit.framework.Assert.assertEquals;
/**
* Tests for the top level reader/streamFactory of ORC files.
*/
public class TestOrcFile {
public static class InnerStruct {
int int1;
Text string1 = new Text();
InnerStruct(int int1, String string1) {
this.int1 = int1;
this.string1.set(string1);
}
}
public static class MiddleStruct {
List<InnerStruct> list = new ArrayList<InnerStruct>();
MiddleStruct(InnerStruct... items) {
list.clear();
for(InnerStruct item: items) {
list.add(item);
}
}
}
public static class BigRow {
Boolean boolean1;
Byte byte1;
Short short1;
Integer int1;
Long long1;
Float float1;
Double double1;
BytesWritable bytes1;
Text string1;
MiddleStruct middle;
List<InnerStruct> list = new ArrayList<InnerStruct>();
Map<Text, InnerStruct> map = new HashMap<Text, InnerStruct>();
BigRow(Boolean b1, Byte b2, Short s1, Integer i1, Long l1, Float f1,
Double d1,
BytesWritable b3, String s2, MiddleStruct m1,
List<InnerStruct> l2, Map<Text, InnerStruct> m2) {
this.boolean1 = b1;
this.byte1 = b2;
this.short1 = s1;
this.int1 = i1;
this.long1 = l1;
this.float1 = f1;
this.double1 = d1;
this.bytes1 = b3;
if (s2 == null) {
this.string1 = null;
} else {
this.string1 = new Text(s2);
}
this.middle = m1;
this.list = l2;
this.map = m2;
}
}
private static InnerStruct inner(int i, String s) {
return new InnerStruct(i, s);
}
private static Map<Text, InnerStruct> map(InnerStruct... items) {
Map<Text, InnerStruct> result = new HashMap<Text, InnerStruct>();
for(InnerStruct i: items) {
result.put(new Text(i.string1), i);
}
return result;
}
private static List<InnerStruct> list(InnerStruct... items) {
List<InnerStruct> result = new ArrayList<InnerStruct>();
for(InnerStruct s: items) {
result.add(s);
}
return result;
}
private static BytesWritable bytes(int... items) {
BytesWritable result = new BytesWritable();
result.setSize(items.length);
for(int i=0; i < items.length; ++i) {
result.getBytes()[i] = (byte) items[i];
}
return result;
}
private static ByteBuffer byteBuf(int... items) {
ByteBuffer result = ByteBuffer.allocate(items.length);
for(int item: items) {
result.put((byte) item);
}
return result;
}
Path workDir = new Path(System.getProperty("test.tmp.dir",
"target" + File.separator + "test" + File.separator + "tmp"));
Configuration conf;
FileSystem fs;
Path testFilePath;
@Rule
public TestName testCaseName = new TestName();
@Before
public void openFileSystem () throws Exception {
conf = new Configuration();
fs = FileSystem.getLocal(conf);
testFilePath = new Path(workDir, "TestOrcFile." +
testCaseName.getMethodName() + ".orc");
fs.delete(testFilePath, false);
}
@Test
public void test1() throws Exception {
ObjectInspector inspector;
synchronized (TestOrcFile.class) {
inspector = ObjectInspectorFactory.getReflectionObjectInspector
(BigRow.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
}
Writer writer = OrcFile.createWriter(fs, testFilePath, conf, inspector,
100000, CompressionKind.ZLIB, 10000, 10000);
writer.addRow(new BigRow(false, (byte) 1, (short) 1024, 65536,
Long.MAX_VALUE, (float) 1.0, -15.0, bytes(0,1,2,3,4), "hi",
new MiddleStruct(inner(1, "bye"), inner(2, "sigh")),
list(inner(3, "good"), inner(4, "bad")),
map()));
writer.addRow(new BigRow(true, (byte) 100, (short) 2048, 65536,
Long.MAX_VALUE, (float) 2.0, -5.0, bytes(), "bye",
new MiddleStruct(inner(1, "bye"), inner(2, "sigh")),
list(inner(100000000, "cat"), inner(-100000, "in"), inner(1234, "hat")),
map(inner(5,"chani"), inner(1,"mauddib"))));
writer.close();
Reader reader = OrcFile.createReader(fs, testFilePath);
// check the stats
ColumnStatistics[] stats = reader.getStatistics();
assertEquals(2, stats[1].getNumberOfValues());
assertEquals(1, ((BooleanColumnStatistics) stats[1]).getFalseCount());
assertEquals(1, ((BooleanColumnStatistics) stats[1]).getTrueCount());
assertEquals("count: 2 true: 1", stats[1].toString());
assertEquals(2048, ((IntegerColumnStatistics) stats[3]).getMaximum());
assertEquals(1024, ((IntegerColumnStatistics) stats[3]).getMinimum());
assertEquals(true, ((IntegerColumnStatistics) stats[3]).isSumDefined());
assertEquals(3072, ((IntegerColumnStatistics) stats[3]).getSum());
assertEquals("count: 2 min: 1024 max: 2048 sum: 3072",
stats[3].toString());
assertEquals(Long.MAX_VALUE,
((IntegerColumnStatistics) stats[5]).getMaximum());
assertEquals(Long.MAX_VALUE,
((IntegerColumnStatistics) stats[5]).getMinimum());
assertEquals(false, ((IntegerColumnStatistics) stats[5]).isSumDefined());
assertEquals("count: 2 min: 9223372036854775807 max: 9223372036854775807",
stats[5].toString());
assertEquals(-15.0, ((DoubleColumnStatistics) stats[7]).getMinimum());
assertEquals(-5.0, ((DoubleColumnStatistics) stats[7]).getMaximum());
assertEquals(-20.0, ((DoubleColumnStatistics) stats[7]).getSum(), 0.00001);
assertEquals("count: 2 min: -15.0 max: -5.0 sum: -20.0",
stats[7].toString());
assertEquals("count: 2 min: bye max: hi", stats[9].toString());
// check the inspectors
StructObjectInspector readerInspector =
(StructObjectInspector) reader.getObjectInspector();
assertEquals(ObjectInspector.Category.STRUCT,
readerInspector.getCategory());
assertEquals("struct<boolean1:boolean,byte1:tinyint,short1:smallint,"
+ "int1:int,long1:bigint,float1:float,double1:double,bytes1:"
+ "binary,string1:string,middle:struct<list:array<struct<int1:int,"
+ "string1:string>>>,list:array<struct<int1:int,string1:string>>,"
+ "map:map<string,struct<int1:int,string1:string>>>",
readerInspector.getTypeName());
List<? extends StructField> fields =
readerInspector.getAllStructFieldRefs();
BooleanObjectInspector bo = (BooleanObjectInspector) readerInspector.
getStructFieldRef("boolean1").getFieldObjectInspector();
ByteObjectInspector by = (ByteObjectInspector) readerInspector.
getStructFieldRef("byte1").getFieldObjectInspector();
ShortObjectInspector sh = (ShortObjectInspector) readerInspector.
getStructFieldRef("short1").getFieldObjectInspector();
IntObjectInspector in = (IntObjectInspector) readerInspector.
getStructFieldRef("int1").getFieldObjectInspector();
LongObjectInspector lo = (LongObjectInspector) readerInspector.
getStructFieldRef("long1").getFieldObjectInspector();
FloatObjectInspector fl = (FloatObjectInspector) readerInspector.
getStructFieldRef("float1").getFieldObjectInspector();
DoubleObjectInspector dbl = (DoubleObjectInspector) readerInspector.
getStructFieldRef("double1").getFieldObjectInspector();
BinaryObjectInspector bi = (BinaryObjectInspector) readerInspector.
getStructFieldRef("bytes1").getFieldObjectInspector();
StringObjectInspector st = (StringObjectInspector) readerInspector.
getStructFieldRef("string1").getFieldObjectInspector();
StructObjectInspector mid = (StructObjectInspector) readerInspector.
getStructFieldRef("middle").getFieldObjectInspector();
List<? extends StructField> midFields =
mid.getAllStructFieldRefs();
ListObjectInspector midli =
(ListObjectInspector) midFields.get(0).getFieldObjectInspector();
StructObjectInspector inner = (StructObjectInspector)
midli.getListElementObjectInspector();
List<? extends StructField> inFields = inner.getAllStructFieldRefs();
ListObjectInspector li = (ListObjectInspector) readerInspector.
getStructFieldRef("list").getFieldObjectInspector();
MapObjectInspector ma = (MapObjectInspector) readerInspector.
getStructFieldRef("map").getFieldObjectInspector();
StructObjectInspector lc = (StructObjectInspector)
li.getListElementObjectInspector();
StringObjectInspector mk = (StringObjectInspector)
ma.getMapKeyObjectInspector();
StructObjectInspector mv = (StructObjectInspector)
ma.getMapValueObjectInspector();
RecordReader rows = reader.rows(null);
Object row = rows.next(null);
assertNotNull(row);
// check the contents of the first row
assertEquals(false,
bo.get(readerInspector.getStructFieldData(row, fields.get(0))));
assertEquals(1, by.get(readerInspector.getStructFieldData(row,
fields.get(1))));
assertEquals(1024, sh.get(readerInspector.getStructFieldData(row,
fields.get(2))));
assertEquals(65536, in.get(readerInspector.getStructFieldData(row,
fields.get(3))));
assertEquals(Long.MAX_VALUE, lo.get(readerInspector.
getStructFieldData(row, fields.get(4))));
assertEquals(1.0, fl.get(readerInspector.getStructFieldData(row,
fields.get(5))), 0.00001);
assertEquals(-15.0, dbl.get(readerInspector.getStructFieldData(row,
fields.get(6))), 0.00001);
assertEquals(bytes(0,1,2,3,4), bi.getPrimitiveWritableObject(
readerInspector.getStructFieldData(row, fields.get(7))));
assertEquals("hi", st.getPrimitiveJavaObject(readerInspector.
getStructFieldData(row, fields.get(8))));
List<?> midRow = midli.getList(mid.getStructFieldData(readerInspector.
getStructFieldData(row, fields.get(9)), midFields.get(0)));
assertNotNull(midRow);
assertEquals(2, midRow.size());
assertEquals(1, in.get(inner.getStructFieldData(midRow.get(0),
inFields.get(0))));
assertEquals("bye", st.getPrimitiveJavaObject(inner.getStructFieldData
(midRow.get(0), inFields.get(1))));
assertEquals(2, in.get(inner.getStructFieldData(midRow.get(1),
inFields.get(0))));
assertEquals("sigh", st.getPrimitiveJavaObject(inner.getStructFieldData
(midRow.get(1), inFields.get(1))));
List<?> list = li.getList(readerInspector.getStructFieldData(row,
fields.get(10)));
assertEquals(2, list.size());
assertEquals(3, in.get(inner.getStructFieldData(list.get(0),
inFields.get(0))));
assertEquals("good", st.getPrimitiveJavaObject(inner.getStructFieldData
(list.get(0), inFields.get(1))));
assertEquals(4, in.get(inner.getStructFieldData(list.get(1),
inFields.get(0))));
assertEquals("bad", st.getPrimitiveJavaObject(inner.getStructFieldData
(list.get(1), inFields.get(1))));
Map<?,?> map = ma.getMap(readerInspector.getStructFieldData(row,
fields.get(11)));
assertEquals(0, map.size());
// check the contents of second row
assertEquals(true, rows.hasNext());
row = rows.next(row);
assertEquals(true,
bo.get(readerInspector.getStructFieldData(row, fields.get(0))));
assertEquals(100, by.get(readerInspector.getStructFieldData(row,
fields.get(1))));
assertEquals(2048, sh.get(readerInspector.getStructFieldData(row,
fields.get(2))));
assertEquals(65536, in.get(readerInspector.getStructFieldData(row,
fields.get(3))));
assertEquals(Long.MAX_VALUE, lo.get(readerInspector.
getStructFieldData(row, fields.get(4))));
assertEquals(2.0, fl.get(readerInspector.getStructFieldData(row,
fields.get(5))), 0.00001);
assertEquals(-5.0, dbl.get(readerInspector.getStructFieldData(row,
fields.get(6))), 0.00001);
assertEquals(bytes(), bi.getPrimitiveWritableObject(
readerInspector.getStructFieldData(row, fields.get(7))));
assertEquals("bye", st.getPrimitiveJavaObject(readerInspector.
getStructFieldData(row, fields.get(8))));
midRow = midli.getList(mid.getStructFieldData(readerInspector.
getStructFieldData(row, fields.get(9)), midFields.get(0)));
assertNotNull(midRow);
assertEquals(2, midRow.size());
assertEquals(1, in.get(inner.getStructFieldData(midRow.get(0),
inFields.get(0))));
assertEquals("bye", st.getPrimitiveJavaObject(inner.getStructFieldData
(midRow.get(0), inFields.get(1))));
assertEquals(2, in.get(inner.getStructFieldData(midRow.get(1),
inFields.get(0))));
assertEquals("sigh", st.getPrimitiveJavaObject(inner.getStructFieldData
(midRow.get(1), inFields.get(1))));
list = li.getList(readerInspector.getStructFieldData(row,
fields.get(10)));
assertEquals(3, list.size());
assertEquals(100000000, in.get(inner.getStructFieldData(list.get(0),
inFields.get(0))));
assertEquals("cat", st.getPrimitiveJavaObject(inner.getStructFieldData
(list.get(0), inFields.get(1))));
assertEquals(-100000, in.get(inner.getStructFieldData(list.get(1),
inFields.get(0))));
assertEquals("in", st.getPrimitiveJavaObject(inner.getStructFieldData
(list.get(1), inFields.get(1))));
assertEquals(1234, in.get(inner.getStructFieldData(list.get(2),
inFields.get(0))));
assertEquals("hat", st.getPrimitiveJavaObject(inner.getStructFieldData
(list.get(2), inFields.get(1))));
map = ma.getMap(readerInspector.getStructFieldData(row,
fields.get(11)));
assertEquals(2, map.size());
boolean[] found = new boolean[2];
for(Object key: map.keySet()) {
String str = mk.getPrimitiveJavaObject(key);
if (str.equals("chani")) {
assertEquals(false, found[0]);
assertEquals(5, in.get(inner.getStructFieldData(map.get(key),
inFields.get(0))));
assertEquals(str, st.getPrimitiveJavaObject(
inner.getStructFieldData(map.get(key), inFields.get(1))));
found[0] = true;
} else if (str.equals("mauddib")) {
assertEquals(false, found[1]);
assertEquals(1, in.get(inner.getStructFieldData(map.get(key),
inFields.get(0))));
assertEquals(str, st.getPrimitiveJavaObject(
inner.getStructFieldData(map.get(key), inFields.get(1))));
found[1] = true;
} else {
throw new IllegalArgumentException("Unknown key " + str);
}
}
assertEquals(true, found[0]);
assertEquals(true, found[1]);
// handle the close up
assertEquals(false, rows.hasNext());
rows.close();
}
@Test
public void columnProjection() throws Exception {
ObjectInspector inspector;
synchronized (TestOrcFile.class) {
inspector = ObjectInspectorFactory.getReflectionObjectInspector
(InnerStruct.class,
ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
}
Writer writer = OrcFile.createWriter(fs, testFilePath, conf, inspector,
1000, CompressionKind.NONE, 100, 1000);
Random r1 = new Random(1);
Random r2 = new Random(2);
int x;
int minInt=0, maxInt=0;
String y;
String minStr = null, maxStr = null;
for(int i=0; i < 21000; ++i) {
x = r1.nextInt();
y = Long.toHexString(r2.nextLong());
if (i == 0 || x < minInt) {
minInt = x;
}
if (i == 0 || x > maxInt) {
maxInt = x;
}
if (i == 0 || y.compareTo(minStr) < 0) {
minStr = y;
}
if (i == 0 || y.compareTo(maxStr) > 0) {
maxStr = y;
}
writer.addRow(inner(x, y));
}
writer.close();
Reader reader = OrcFile.createReader(fs, testFilePath);
// check out the statistics
ColumnStatistics[] stats = reader.getStatistics();
assertEquals(3, stats.length);
for(ColumnStatistics s: stats) {
assertEquals(21000, s.getNumberOfValues());
if (s instanceof IntegerColumnStatistics) {
assertEquals(minInt, ((IntegerColumnStatistics) s).getMinimum());
assertEquals(maxInt, ((IntegerColumnStatistics) s).getMaximum());
} else if (s instanceof StringColumnStatistics) {
assertEquals(maxStr, ((StringColumnStatistics) s).getMaximum());
assertEquals(minStr, ((StringColumnStatistics) s).getMinimum());
}
}
// check out the types
List<OrcProto.Type> types = reader.getTypes();
assertEquals(3, types.size());
assertEquals(OrcProto.Type.Kind.STRUCT, types.get(0).getKind());
assertEquals(2, types.get(0).getSubtypesCount());
assertEquals(1, types.get(0).getSubtypes(0));
assertEquals(2, types.get(0).getSubtypes(1));
assertEquals(OrcProto.Type.Kind.INT, types.get(1).getKind());
assertEquals(0, types.get(1).getSubtypesCount());
assertEquals(OrcProto.Type.Kind.STRING, types.get(2).getKind());
assertEquals(0, types.get(2).getSubtypesCount());
// read the contents and make sure they match
RecordReader rows1 = reader.rows(new boolean[]{true, true, false});
RecordReader rows2 = reader.rows(new boolean[]{true, false, true});
r1 = new Random(1);
r2 = new Random(2);
OrcStruct row1 = null;
OrcStruct row2 = null;
for(int i = 0; i < 21000; ++i) {
assertEquals(true, rows1.hasNext());
assertEquals(true, rows2.hasNext());
row1 = (OrcStruct) rows1.next(row1);
row2 = (OrcStruct) rows2.next(row2);
assertEquals(r1.nextInt(), ((IntWritable) row1.getFieldValue(0)).get());
assertEquals(Long.toHexString(r2.nextLong()),
row2.getFieldValue(1).toString());
}
assertEquals(false, rows1.hasNext());
assertEquals(false, rows2.hasNext());
rows1.close();
rows2.close();
}
@Test
public void emptyFile() throws Exception {
ObjectInspector inspector;
synchronized (TestOrcFile.class) {
inspector = ObjectInspectorFactory.getReflectionObjectInspector
(BigRow.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
}
Writer writer = OrcFile.createWriter(fs, testFilePath, conf, inspector,
1000, CompressionKind.NONE, 100, 10000);
writer.close();
Reader reader = OrcFile.createReader(fs, testFilePath);
assertEquals(false, reader.rows(null).hasNext());
assertEquals(CompressionKind.NONE, reader.getCompression());
assertEquals(0, reader.getNumberOfRows());
assertEquals(0, reader.getCompressionSize());
assertEquals(false, reader.getMetadataKeys().iterator().hasNext());
assertEquals(3, reader.getContentLength());
assertEquals(false, reader.getStripes().iterator().hasNext());
}
@Test
public void metaData() throws Exception {
ObjectInspector inspector;
synchronized (TestOrcFile.class) {
inspector = ObjectInspectorFactory.getReflectionObjectInspector
(BigRow.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
}
Writer writer = OrcFile.createWriter(fs, testFilePath, conf, inspector,
1000, CompressionKind.NONE, 100, 10000);
writer.addUserMetadata("my.meta", byteBuf(1, 2, 3, 4, 5, 6, 7, -1, -2, 127, -128));
writer.addUserMetadata("clobber", byteBuf(1,2,3));
writer.addUserMetadata("clobber", byteBuf(4,3,2,1));
ByteBuffer bigBuf = ByteBuffer.allocate(40000);
Random random = new Random(0);
random.nextBytes(bigBuf.array());
writer.addUserMetadata("big", bigBuf);
bigBuf.position(0);
writer.addRow(new BigRow(true, (byte) 127, (short) 1024, 42,
42L * 1024 * 1024 * 1024, (float) 3.1415, -2.713, null,
null, null, null, null));
writer.addUserMetadata("clobber", byteBuf(5,7,11,13,17,19));
writer.close();
Reader reader = OrcFile.createReader(fs, testFilePath);
assertEquals(byteBuf(5,7,11,13,17,19), reader.getMetadataValue("clobber"));
assertEquals(byteBuf(1,2,3,4,5,6,7,-1,-2,127,-128),
reader.getMetadataValue("my.meta"));
assertEquals(bigBuf, reader.getMetadataValue("big"));
try {
reader.getMetadataValue("unknown");
assertTrue(false);
} catch (IllegalArgumentException iae) {
// PASS
}
int i = 0;
for(String key: reader.getMetadataKeys()) {
if ("my.meta".equals(key) ||
"clobber".equals(key) ||
"big".equals(key)) {
i += 1;
} else {
throw new IllegalArgumentException("unknown key " + key);
}
}
assertEquals(3, i);
}
/**
* We test union, timestamp, and decimal separately since we need to make the
* object inspector manually. (The Hive reflection-based doesn't handle
* them properly.)
*/
@Test
public void testUnionAndTimestamp() throws Exception {
List<OrcProto.Type> types = new ArrayList<OrcProto.Type>();
types.add(OrcProto.Type.newBuilder().setKind(OrcProto.Type.Kind.STRUCT).
addFieldNames("time").addFieldNames("union").addFieldNames("decimal").
addSubtypes(1).addSubtypes(2).addSubtypes(5).build());
types.add(OrcProto.Type.newBuilder().setKind(OrcProto.Type.Kind.TIMESTAMP).
build());
types.add(OrcProto.Type.newBuilder().setKind(OrcProto.Type.Kind.UNION).
addSubtypes(3).addSubtypes(4).build());
types.add(OrcProto.Type.newBuilder().setKind(OrcProto.Type.Kind.INT).
build());
types.add(OrcProto.Type.newBuilder().setKind(OrcProto.Type.Kind.STRING).
build());
types.add(OrcProto.Type.newBuilder().setKind(OrcProto.Type.Kind.DECIMAL).
build());
ObjectInspector inspector;
synchronized (TestOrcFile.class) {
inspector = OrcStruct.createObjectInspector(0, types);
}
HiveDecimal maxValue = new HiveDecimal("100000000000000000000");
Writer writer = OrcFile.createWriter(fs, testFilePath, conf, inspector,
1000, CompressionKind.NONE, 100, 10000);
OrcStruct row = new OrcStruct(3);
OrcUnion union = new OrcUnion();
row.setFieldValue(1, union);
row.setFieldValue(0, Timestamp.valueOf("2000-03-12 15:00:00"));
HiveDecimal value = new HiveDecimal("12345678.6547456");
row.setFieldValue(2, value);
union.set((byte) 0, new IntWritable(42));
writer.addRow(row);
row.setFieldValue(0, Timestamp.valueOf("2000-03-20 12:00:00.123456789"));
union.set((byte) 1, new Text("hello"));
value = new HiveDecimal("-5643.234");
row.setFieldValue(2, value);
writer.addRow(row);
row.setFieldValue(0, null);
row.setFieldValue(1, null);
row.setFieldValue(2, null);
writer.addRow(row);
row.setFieldValue(1, union);
union.set((byte) 0, null);
writer.addRow(row);
union.set((byte) 1, null);
writer.addRow(row);
union.set((byte) 0, new IntWritable(200000));
row.setFieldValue(0, Timestamp.valueOf("1900-01-01 00:00:00"));
value = new HiveDecimal("100000000000000000000");
row.setFieldValue(2, value);
writer.addRow(row);
Random rand = new Random(42);
for(int i=1900; i < 2200; ++i) {
row.setFieldValue(0, Timestamp.valueOf(i + "-05-05 12:34:56." + i));
if ((i & 1) == 0) {
union.set((byte) 0, new IntWritable(i*i));
} else {
union.set((byte) 1, new Text(new Integer(i*i).toString()));
}
value = new HiveDecimal(new BigInteger(118, rand),
rand.nextInt(36));
row.setFieldValue(2, value);
if (maxValue.compareTo(value) < 0) {
maxValue = value;
}
writer.addRow(row);
}
// let's add a lot of constant rows to test the rle
row.setFieldValue(0, null);
union.set((byte) 0, new IntWritable(1732050807));
row.setFieldValue(2, null);
for(int i=0; i < 5000; ++i) {
writer.addRow(row);
}
union.set((byte) 0, new IntWritable(0));
writer.addRow(row);
union.set((byte) 0, new IntWritable(10));
writer.addRow(row);
union.set((byte) 0, new IntWritable(138));
writer.addRow(row);
writer.close();
Reader reader = OrcFile.createReader(fs, testFilePath);
assertEquals(false, reader.getMetadataKeys().iterator().hasNext());
assertEquals(5309, reader.getNumberOfRows());
DecimalColumnStatistics stats =
(DecimalColumnStatistics) reader.getStatistics()[5];
assertEquals(303, stats.getNumberOfValues());
assertEquals(new HiveDecimal("-5643.234"), stats.getMinimum());
assertEquals(maxValue, stats.getMaximum());
assertEquals(null, stats.getSum());
int stripeCount = 0;
int rowCount = 0;
long currentOffset = -1;
for(StripeInformation stripe: reader.getStripes()) {
stripeCount += 1;
rowCount += stripe.getNumberOfRows();
if (currentOffset < 0) {
currentOffset = stripe.getOffset() + stripe.getIndexLength() +
stripe.getDataLength() + stripe.getFooterLength();
} else {
assertEquals(currentOffset, stripe.getOffset());
currentOffset += stripe.getIndexLength() +
stripe.getDataLength() + stripe.getFooterLength();
}
}
assertEquals(reader.getNumberOfRows(), rowCount);
assertEquals(2, stripeCount);
assertEquals(reader.getContentLength(), currentOffset);
RecordReader rows = reader.rows(null);
assertEquals(0, rows.getRowNumber());
assertEquals(0.0, rows.getProgress(), 0.000001);
assertEquals(true, rows.hasNext());
row = (OrcStruct) rows.next(null);
inspector = reader.getObjectInspector();
assertEquals("struct<time:timestamp,union:uniontype<int,string>,decimal:decimal>",
inspector.getTypeName());
assertEquals(Timestamp.valueOf("2000-03-12 15:00:00"),
row.getFieldValue(0));
union = (OrcUnion) row.getFieldValue(1);
assertEquals(0, union.getTag());
assertEquals(new IntWritable(42), union.getObject());
assertEquals(new HiveDecimal("12345678.6547456"), row.getFieldValue(2));
row = (OrcStruct) rows.next(row);
assertEquals(Timestamp.valueOf("2000-03-20 12:00:00.123456789"),
row.getFieldValue(0));
assertEquals(1, union.getTag());
assertEquals(new Text("hello"), union.getObject());
assertEquals(new HiveDecimal("-5643.234"), row.getFieldValue(2));
row = (OrcStruct) rows.next(row);
assertEquals(null, row.getFieldValue(0));
assertEquals(null, row.getFieldValue(1));
assertEquals(null, row.getFieldValue(2));
row = (OrcStruct) rows.next(row);
assertEquals(null, row.getFieldValue(0));
union = (OrcUnion) row.getFieldValue(1);
assertEquals(0, union.getTag());
assertEquals(null, union.getObject());
assertEquals(null, row.getFieldValue(2));
row = (OrcStruct) rows.next(row);
assertEquals(null, row.getFieldValue(0));
assertEquals(1, union.getTag());
assertEquals(null, union.getObject());
assertEquals(null, row.getFieldValue(2));
row = (OrcStruct) rows.next(row);
assertEquals(Timestamp.valueOf("1900-01-01 00:00:00"),
row.getFieldValue(0));
assertEquals(new IntWritable(200000), union.getObject());
assertEquals(new HiveDecimal("100000000000000000000"),
row.getFieldValue(2));
rand = new Random(42);
for(int i=1900; i < 2200; ++i) {
row = (OrcStruct) rows.next(row);
assertEquals(Timestamp.valueOf(i + "-05-05 12:34:56." + i),
row.getFieldValue(0));
if ((i & 1) == 0) {
assertEquals(0, union.getTag());
assertEquals(new IntWritable(i*i), union.getObject());
} else {
assertEquals(1, union.getTag());
assertEquals(new Text(new Integer(i*i).toString()), union.getObject());
}
assertEquals(new HiveDecimal(new BigInteger(118, rand),
rand.nextInt(36)), row.getFieldValue(2));
}
for(int i=0; i < 5000; ++i) {
row = (OrcStruct) rows.next(row);
assertEquals(new IntWritable(1732050807), union.getObject());
}
row = (OrcStruct) rows.next(row);
assertEquals(new IntWritable(0), union.getObject());
row = (OrcStruct) rows.next(row);
assertEquals(new IntWritable(10), union.getObject());
row = (OrcStruct) rows.next(row);
assertEquals(new IntWritable(138), union.getObject());
assertEquals(false, rows.hasNext());
assertEquals(1.0, rows.getProgress(), 0.00001);
assertEquals(reader.getNumberOfRows(), rows.getRowNumber());
rows.seekToRow(1);
row = (OrcStruct) rows.next(row);
assertEquals(Timestamp.valueOf("2000-03-20 12:00:00.123456789"),
row.getFieldValue(0));
assertEquals(1, union.getTag());
assertEquals(new Text("hello"), union.getObject());
assertEquals(new HiveDecimal("-5643.234"), row.getFieldValue(2));
rows.close();
}
/**
* Read and write a randomly generated snappy file.
* @throws Exception
*/
@Test
public void testSnappy() throws Exception {
ObjectInspector inspector;
synchronized (TestOrcFile.class) {
inspector = ObjectInspectorFactory.getReflectionObjectInspector
(InnerStruct.class,
ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
}
Writer writer = OrcFile.createWriter(fs, testFilePath, conf, inspector,
1000, CompressionKind.SNAPPY, 100, 10000);
Random rand = new Random(12);
for(int i=0; i < 10000; ++i) {
writer.addRow(new InnerStruct(rand.nextInt(),
Integer.toHexString(rand.nextInt())));
}
writer.close();
Reader reader = OrcFile.createReader(fs, testFilePath);
RecordReader rows = reader.rows(null);
rand = new Random(12);
OrcStruct row = null;
for(int i=0; i < 10000; ++i) {
assertEquals(true, rows.hasNext());
row = (OrcStruct) rows.next(row);
assertEquals(rand.nextInt(), ((IntWritable) row.getFieldValue(0)).get());
assertEquals(Integer.toHexString(rand.nextInt()),
row.getFieldValue(1).toString());
}
assertEquals(false, rows.hasNext());
rows.close();
}
/**
* Read and write a randomly generated snappy file.
* @throws Exception
*/
@Test
public void testWithoutIndex() throws Exception {
ObjectInspector inspector;
synchronized (TestOrcFile.class) {
inspector = ObjectInspectorFactory.getReflectionObjectInspector
(InnerStruct.class,
ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
}
Writer writer = OrcFile.createWriter(fs, testFilePath, conf, inspector,
5000, CompressionKind.SNAPPY, 1000, 0);
Random rand = new Random(24);
for(int i=0; i < 10000; ++i) {
InnerStruct row = new InnerStruct(rand.nextInt(),
Integer.toBinaryString(rand.nextInt()));
for(int j=0; j< 5; ++j) {
writer.addRow(row);
}
}
writer.close();
Reader reader = OrcFile.createReader(fs, testFilePath);
assertEquals(50000, reader.getNumberOfRows());
assertEquals(0, reader.getRowIndexStride());
StripeInformation stripe = reader.getStripes().iterator().next();
assertEquals(true, stripe.getDataLength() != 0);
assertEquals(0, stripe.getIndexLength());
RecordReader rows = reader.rows(null);
rand = new Random(24);
OrcStruct row = null;
for(int i=0; i < 10000; ++i) {
int intVal = rand.nextInt();
String strVal = Integer.toBinaryString(rand.nextInt());
for(int j=0; j < 5; ++j) {
assertEquals(true, rows.hasNext());
row = (OrcStruct) rows.next(row);
assertEquals(intVal, ((IntWritable) row.getFieldValue(0)).get());
assertEquals(strVal, row.getFieldValue(1).toString());
}
}
assertEquals(false, rows.hasNext());
rows.close();
}
@Test
public void testSeek() throws Exception {
ObjectInspector inspector;
synchronized (TestOrcFile.class) {
inspector = ObjectInspectorFactory.getReflectionObjectInspector
(BigRow.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
}
Writer writer = OrcFile.createWriter(fs, testFilePath, conf, inspector,
200000, CompressionKind.ZLIB, 65536, 1000);
Random rand = new Random(42);
final int COUNT=32768;
long[] intValues= new long[COUNT];
double[] doubleValues = new double[COUNT];
String[] stringValues = new String[COUNT];
BytesWritable[] byteValues = new BytesWritable[COUNT];
String[] words = new String[128];
for(int i=0; i < words.length; ++i) {
words[i] = Integer.toHexString(rand.nextInt());
}
for(int i=0; i < COUNT/2; ++i) {
intValues[2*i] = rand.nextLong();
intValues[2*i+1] = intValues[2*i];
stringValues[2*i] = words[rand.nextInt(words.length)];
stringValues[2*i+1] = stringValues[2*i];
}
for(int i=0; i < COUNT; ++i) {
doubleValues[i] = rand.nextDouble();
byte[] buf = new byte[20];
rand.nextBytes(buf);
byteValues[i] = new BytesWritable(buf);
}
for(int i=0; i < COUNT; ++i) {
writer.addRow(createRandomRow(intValues, doubleValues, stringValues,
byteValues, words, i));
}
writer.close();
writer = null;
Reader reader = OrcFile.createReader(fs, testFilePath);
assertEquals(COUNT, reader.getNumberOfRows());
RecordReader rows = reader.rows(null);
OrcStruct row = null;
for(int i=COUNT-1; i >= 0; --i) {
rows.seekToRow(i);
row = (OrcStruct) rows.next(row);
BigRow expected = createRandomRow(intValues, doubleValues,
stringValues, byteValues, words, i);
assertEquals(expected.boolean1.booleanValue(),
((BooleanWritable) row.getFieldValue(0)).get());
assertEquals(expected.byte1.byteValue(),
((ByteWritable) row.getFieldValue(1)).get());
assertEquals(expected.short1.shortValue(),
((ShortWritable) row.getFieldValue(2)).get());
assertEquals(expected.int1.intValue(),
((IntWritable) row.getFieldValue(3)).get());
assertEquals(expected.long1.longValue(),
((LongWritable) row.getFieldValue(4)).get());
assertEquals(expected.float1.floatValue(),
((FloatWritable) row.getFieldValue(5)).get(), 0.0001);
assertEquals(expected.double1.doubleValue(),
((DoubleWritable) row.getFieldValue(6)).get(), 0.0001);
assertEquals(expected.bytes1, row.getFieldValue(7));
assertEquals(expected.string1, row.getFieldValue(8));
List<InnerStruct> expectedList = expected.middle.list;
List<OrcStruct> actualList =
(List) ((OrcStruct) row.getFieldValue(9)).getFieldValue(0);
compareList(expectedList, actualList);
compareList(expected.list, (List) row.getFieldValue(10));
}
rows.close();
Iterator<StripeInformation> stripeIterator =
reader.getStripes().iterator();
long offsetOfStripe2 = 0;
long offsetOfStripe4 = 0;
long lastRowOfStripe2 = 0;
for(int i = 0; i < 5; ++i) {
StripeInformation stripe = stripeIterator.next();
if (i < 2) {
lastRowOfStripe2 += stripe.getNumberOfRows();
} else if (i == 2) {
offsetOfStripe2 = stripe.getOffset();
lastRowOfStripe2 += stripe.getNumberOfRows() - 1;
} else if (i == 4) {
offsetOfStripe4 = stripe.getOffset();
}
}
boolean[] columns = new boolean[reader.getStatistics().length];
columns[5] = true; // long colulmn
columns[9] = true; // text column
rows = reader.rows(offsetOfStripe2, offsetOfStripe4 - offsetOfStripe2,
columns);
rows.seekToRow(lastRowOfStripe2);
for(int i = 0; i < 2; ++i) {
row = (OrcStruct) rows.next(row);
BigRow expected = createRandomRow(intValues, doubleValues,
stringValues, byteValues, words,
(int) (lastRowOfStripe2 + i));
assertEquals(expected.long1.longValue(),
((LongWritable) row.getFieldValue(4)).get());
assertEquals(expected.string1, row.getFieldValue(8));
}
rows.close();
}
private void compareInner(InnerStruct expect,
OrcStruct actual) throws Exception {
if (expect == null || actual == null) {
assertEquals(expect, actual);
} else {
assertEquals(expect.int1, ((IntWritable) actual.getFieldValue(0)).get());
assertEquals(expect.string1, actual.getFieldValue(1));
}
}
private void compareList(List<InnerStruct> expect,
List<OrcStruct> actual) throws Exception {
assertEquals(expect.size(), actual.size());
for(int j=0; j < expect.size(); ++j) {
compareInner(expect.get(j), actual.get(j));
}
}
private BigRow createRandomRow(long[] intValues, double[] doubleValues,
String[] stringValues,
BytesWritable[] byteValues,
String[] words, int i) {
InnerStruct inner = new InnerStruct((int) intValues[i], stringValues[i]);
InnerStruct inner2 = new InnerStruct((int) (intValues[i] >> 32),
words[i % words.length] + "-x");
return new BigRow((intValues[i] & 1) == 0, (byte) intValues[i],
(short) intValues[i], (int) intValues[i], intValues[i],
(float) doubleValues[i], doubleValues[i], byteValues[i],stringValues[i],
new MiddleStruct(inner, inner2), list(), map(inner,inner2));
}
private static class MyMemoryManager extends MemoryManager {
final long totalSpace;
double rate;
Path path = null;
long lastAllocation = 0;
int rows = 0;
MemoryManager.Callback callback;
MyMemoryManager(Configuration conf, long totalSpace, double rate) {
super(conf);
this.totalSpace = totalSpace;
this.rate = rate;
}
@Override
void addWriter(Path path, long requestedAllocation,
MemoryManager.Callback callback) {
this.path = path;
this.lastAllocation = requestedAllocation;
this.callback = callback;
}
@Override
synchronized void removeWriter(Path path) {
this.path = null;
this.lastAllocation = 0;
}
@Override
long getTotalMemoryPool() {
return totalSpace;
}
@Override
double getAllocationScale() {
return rate;
}
@Override
void addedRow() throws IOException {
if (++rows % 100 == 0) {
callback.checkMemory(rate);
}
}
}
@Test
public void testMemoryManagement() throws Exception {
ObjectInspector inspector;
synchronized (TestOrcFile.class) {
inspector = ObjectInspectorFactory.getReflectionObjectInspector
(InnerStruct.class,
ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
}
MyMemoryManager memory = new MyMemoryManager(conf, 10000, 0.1);
Writer writer = new WriterImpl(fs, testFilePath, inspector,
50000, CompressionKind.NONE, 100, 0, memory);
assertEquals(testFilePath, memory.path);
for(int i=0; i < 2500; ++i) {
writer.addRow(new InnerStruct(i*300, Integer.toHexString(10*i)));
}
writer.close();
assertEquals(null, memory.path);
Reader reader = OrcFile.createReader(fs, testFilePath);
int i = 0;
for(StripeInformation stripe: reader.getStripes()) {
i += 1;
assertTrue("stripe " + i + " is too long at " + stripe.getDataLength(),
stripe.getDataLength() < 5000);
}
assertEquals(25, i);
assertEquals(2500, reader.getNumberOfRows());
}
}