| /* |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| package org.apache.orc; |
| |
| import org.apache.orc.impl.InStream; |
| import org.apache.orc.impl.KeyProvider; |
| import org.apache.orc.impl.MemoryManagerImpl; |
| import org.apache.orc.impl.OrcCodecPool; |
| |
| import org.apache.orc.impl.WriterImpl; |
| |
| import org.apache.orc.OrcFile.WriterOptions; |
| |
| import com.google.common.collect.Lists; |
| |
| import org.apache.orc.impl.ReaderImpl; |
| import org.apache.orc.impl.reader.ReaderEncryption; |
| import org.apache.orc.impl.reader.StripePlanner; |
| import org.junit.Assert; |
| import org.apache.hadoop.conf.Configuration; |
| import org.apache.hadoop.fs.FileSystem; |
| import org.apache.hadoop.fs.Path; |
| import org.apache.hadoop.hive.common.type.HiveDecimal; |
| import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; |
| import org.apache.hadoop.hive.ql.exec.vector.Decimal64ColumnVector; |
| import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector; |
| import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector; |
| import org.apache.hadoop.hive.ql.exec.vector.ListColumnVector; |
| import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector; |
| import org.apache.hadoop.hive.ql.exec.vector.MapColumnVector; |
| import org.apache.hadoop.hive.ql.exec.vector.StructColumnVector; |
| import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector; |
| import org.apache.hadoop.hive.ql.exec.vector.UnionColumnVector; |
| import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; |
| import org.apache.hadoop.hive.ql.io.sarg.PredicateLeaf; |
| import org.apache.hadoop.hive.ql.io.sarg.SearchArgument; |
| import org.apache.hadoop.hive.ql.io.sarg.SearchArgumentFactory; |
| import org.apache.hadoop.hive.serde2.io.DateWritable; |
| import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable; |
| import org.apache.hadoop.io.BytesWritable; |
| import org.apache.hadoop.io.Text; |
| import org.apache.orc.impl.DataReaderProperties; |
| import org.apache.orc.impl.OrcIndex; |
| import org.apache.orc.impl.RecordReaderImpl; |
| import org.apache.orc.impl.RecordReaderUtils; |
| import org.junit.Assume; |
| import org.junit.Before; |
| import org.junit.Rule; |
| import org.junit.Test; |
| import org.junit.rules.TestName; |
| import org.junit.runner.RunWith; |
| import org.junit.runners.Parameterized; |
| import org.mockito.Mockito; |
| |
| import javax.xml.bind.DatatypeConverter; |
| import java.io.File; |
| import java.io.IOException; |
| import java.math.BigInteger; |
| import java.net.URL; |
| import java.nio.ByteBuffer; |
| import java.nio.charset.StandardCharsets; |
| import java.security.KeyStore; |
| import java.security.MessageDigest; |
| import java.security.NoSuchAlgorithmException; |
| import java.sql.Date; |
| import java.sql.Timestamp; |
| import java.util.ArrayList; |
| import java.util.Arrays; |
| import java.util.Collection; |
| import java.util.HashMap; |
| import java.util.Iterator; |
| import java.util.List; |
| import java.util.Map; |
| import java.util.Random; |
| import java.util.TimeZone; |
| import java.util.function.IntFunction; |
| |
| import static junit.framework.TestCase.assertNotNull; |
| import static org.junit.Assert.*; |
| |
| /** |
| * Tests for the vectorized reader and writer for ORC files. |
| */ |
| @RunWith(Parameterized.class) |
| public class TestVectorOrcFile { |
| |
| @Parameterized.Parameter |
| public OrcFile.Version fileFormat; |
| |
| @Parameterized.Parameters |
| public static Collection<Object[]> getParameters() { |
| OrcFile.Version[] params = new OrcFile.Version[]{ |
| OrcFile.Version.V_0_11, |
| OrcFile.Version.V_0_12, |
| OrcFile.Version.UNSTABLE_PRE_2_0}; |
| |
| List<Object[]> result = new ArrayList<>(); |
| for(OrcFile.Version v: params) { |
| result.add(new Object[]{v}); |
| } |
| return result; |
| } |
| |
| public static String getFileFromClasspath(String name) { |
| URL url = ClassLoader.getSystemResource(name); |
| if (url == null) { |
| throw new IllegalArgumentException("Could not find " + name); |
| } |
| return url.getPath(); |
| } |
| |
| public static class InnerStruct { |
| int int1; |
| Text string1 = new Text(); |
| InnerStruct(int int1, Text string1) { |
| this.int1 = int1; |
| this.string1.set(string1); |
| } |
| InnerStruct(int int1, String string1) { |
| this.int1 = int1; |
| this.string1.set(string1); |
| } |
| |
| public String toString() { |
| return "{" + int1 + ", " + string1 + "}"; |
| } |
| } |
| |
| public static class MiddleStruct { |
| List<InnerStruct> list = new ArrayList<InnerStruct>(); |
| |
| MiddleStruct(InnerStruct... items) { |
| list.clear(); |
| list.addAll(Arrays.asList(items)); |
| } |
| } |
| |
| private static InnerStruct inner(int i, String s) { |
| return new InnerStruct(i, s); |
| } |
| |
| private static Map<String, InnerStruct> map(InnerStruct... items) { |
| Map<String, InnerStruct> result = new HashMap<String, InnerStruct>(); |
| for(InnerStruct i: items) { |
| result.put(i.string1.toString(), i); |
| } |
| return result; |
| } |
| |
| private static List<InnerStruct> list(InnerStruct... items) { |
| List<InnerStruct> result = new ArrayList<InnerStruct>(); |
| result.addAll(Arrays.asList(items)); |
| return result; |
| } |
| |
| private static BytesWritable bytes(int... items) { |
| BytesWritable result = new BytesWritable(); |
| result.setSize(items.length); |
| for(int i=0; i < items.length; ++i) { |
| result.getBytes()[i] = (byte) items[i]; |
| } |
| return result; |
| } |
| |
| private static byte[] bytesArray(int... items) { |
| byte[] result = new byte[items.length]; |
| for(int i=0; i < items.length; ++i) { |
| result[i] = (byte) items[i]; |
| } |
| return result; |
| } |
| |
| private static ByteBuffer byteBuf(int... items) { |
| ByteBuffer result = ByteBuffer.allocate(items.length); |
| for(int item: items) { |
| result.put((byte) item); |
| } |
| result.flip(); |
| return result; |
| } |
| |
| Path workDir = new Path(System.getProperty("test.tmp.dir", |
| "target" + File.separator + "test" + File.separator + "tmp")); |
| |
| Configuration conf; |
| FileSystem fs; |
| Path testFilePath; |
| |
| @Rule |
| public TestName testCaseName = new TestName(); |
| |
| @Before |
| public void openFileSystem () throws Exception { |
| conf = new Configuration(); |
| fs = FileSystem.getLocal(conf); |
| testFilePath = new Path(workDir, "TestVectorOrcFile." + |
| testCaseName.getMethodName().replaceFirst("\\[[0-9]+\\]", "") |
| + "." + fileFormat.getName() + ".orc"); |
| fs.delete(testFilePath, false); |
| } |
| |
| @Test |
| public void testReadFormat_0_11() throws Exception { |
| Assume.assumeTrue(fileFormat == OrcFile.Version.V_0_11); |
| Path oldFilePath = |
| new Path(getFileFromClasspath("orc-file-11-format.orc")); |
| Reader reader = OrcFile.createReader(oldFilePath, |
| OrcFile.readerOptions(conf).filesystem(fs)); |
| |
| int stripeCount = 0; |
| int rowCount = 0; |
| long currentOffset = -1; |
| for(StripeInformation stripe : reader.getStripes()) { |
| stripeCount += 1; |
| rowCount += stripe.getNumberOfRows(); |
| if (currentOffset < 0) { |
| currentOffset = stripe.getOffset() + stripe.getIndexLength() |
| + stripe.getDataLength() + stripe.getFooterLength(); |
| } else { |
| assertEquals(currentOffset, stripe.getOffset()); |
| currentOffset += stripe.getIndexLength() + stripe.getDataLength() |
| + stripe.getFooterLength(); |
| } |
| } |
| Assert.assertEquals(reader.getNumberOfRows(), rowCount); |
| assertEquals(2, stripeCount); |
| |
| // check the stats |
| ColumnStatistics[] stats = reader.getStatistics(); |
| assertEquals(7500, stats[1].getNumberOfValues()); |
| assertEquals(3750, ((BooleanColumnStatistics) stats[1]).getFalseCount()); |
| assertEquals(3750, ((BooleanColumnStatistics) stats[1]).getTrueCount()); |
| assertEquals("count: 7500 hasNull: true true: 3750", stats[1].toString()); |
| |
| assertEquals(2048, ((IntegerColumnStatistics) stats[3]).getMaximum()); |
| assertEquals(1024, ((IntegerColumnStatistics) stats[3]).getMinimum()); |
| assertEquals(true, ((IntegerColumnStatistics) stats[3]).isSumDefined()); |
| assertEquals(11520000, ((IntegerColumnStatistics) stats[3]).getSum()); |
| assertEquals("count: 7500 hasNull: true min: 1024 max: 2048 sum: 11520000", |
| stats[3].toString()); |
| |
| assertEquals(Long.MAX_VALUE, |
| ((IntegerColumnStatistics) stats[5]).getMaximum()); |
| assertEquals(Long.MAX_VALUE, |
| ((IntegerColumnStatistics) stats[5]).getMinimum()); |
| assertEquals(false, ((IntegerColumnStatistics) stats[5]).isSumDefined()); |
| assertEquals( |
| "count: 7500 hasNull: true min: 9223372036854775807 max: 9223372036854775807", |
| stats[5].toString()); |
| |
| assertEquals(-15.0, ((DoubleColumnStatistics) stats[7]).getMinimum(), 0.0001); |
| assertEquals(-5.0, ((DoubleColumnStatistics) stats[7]).getMaximum(), 0.0001); |
| assertEquals(-75000.0, ((DoubleColumnStatistics) stats[7]).getSum(), |
| 0.00001); |
| assertEquals("count: 7500 hasNull: true min: -15.0 max: -5.0 sum: -75000.0", |
| stats[7].toString()); |
| |
| assertEquals("count: 7500 hasNull: true min: bye max: hi sum: 0", stats[9].toString()); |
| |
| // check the inspectors |
| TypeDescription schema = reader.getSchema(); |
| assertEquals(TypeDescription.Category.STRUCT, schema.getCategory()); |
| assertEquals("struct<boolean1:boolean,byte1:tinyint,short1:smallint," |
| + "int1:int,long1:bigint,float1:float,double1:double,bytes1:" |
| + "binary,string1:string,middle:struct<list:array<struct<int1:int," |
| + "string1:string>>>,list:array<struct<int1:int,string1:string>>," |
| + "map:map<string,struct<int1:int,string1:string>>,ts:timestamp," |
| + "decimal1:decimal(38,10)>", schema.toString()); |
| VectorizedRowBatch batch = schema.createRowBatch(); |
| |
| RecordReader rows = reader.rows(); |
| Assert.assertEquals(true, rows.nextBatch(batch)); |
| assertEquals(1024, batch.size); |
| |
| // check the contents of the first row |
| assertEquals(false, getBoolean(batch, 0)); |
| assertEquals(1, getByte(batch, 0)); |
| assertEquals(1024, getShort(batch, 0)); |
| assertEquals(65536, getInt(batch, 0)); |
| assertEquals(Long.MAX_VALUE, getLong(batch, 0)); |
| assertEquals(1.0, getFloat(batch, 0), 0.00001); |
| assertEquals(-15.0, getDouble(batch, 0), 0.00001); |
| assertEquals(bytes(0, 1, 2, 3, 4), getBinary(batch, 0)); |
| assertEquals("hi", getText(batch, 0).toString()); |
| |
| StructColumnVector middle = (StructColumnVector) batch.cols[9]; |
| ListColumnVector midList = (ListColumnVector) middle.fields[0]; |
| StructColumnVector midListStruct = (StructColumnVector) midList.child; |
| LongColumnVector midListInt = (LongColumnVector) midListStruct.fields[0]; |
| BytesColumnVector midListStr = (BytesColumnVector) midListStruct.fields[1]; |
| ListColumnVector list = (ListColumnVector) batch.cols[10]; |
| StructColumnVector listStruct = (StructColumnVector) list.child; |
| LongColumnVector listInts = (LongColumnVector) listStruct.fields[0]; |
| BytesColumnVector listStrs = (BytesColumnVector) listStruct.fields[1]; |
| MapColumnVector map = (MapColumnVector) batch.cols[11]; |
| BytesColumnVector mapKey = (BytesColumnVector) map.keys; |
| StructColumnVector mapValue = (StructColumnVector) map.values; |
| LongColumnVector mapValueInts = (LongColumnVector) mapValue.fields[0]; |
| BytesColumnVector mapValueStrs = (BytesColumnVector) mapValue.fields[1]; |
| TimestampColumnVector timestamp = (TimestampColumnVector) batch.cols[12]; |
| DecimalColumnVector decs = (DecimalColumnVector) batch.cols[13]; |
| |
| assertEquals(false, middle.isNull[0]); |
| assertEquals(2, midList.lengths[0]); |
| int start = (int) midList.offsets[0]; |
| assertEquals(1, midListInt.vector[start]); |
| assertEquals("bye", midListStr.toString(start)); |
| assertEquals(2, midListInt.vector[start + 1]); |
| assertEquals("sigh", midListStr.toString(start + 1)); |
| |
| assertEquals(2, list.lengths[0]); |
| start = (int) list.offsets[0]; |
| assertEquals(3, listInts.vector[start]); |
| assertEquals("good", listStrs.toString(start)); |
| assertEquals(4, listInts.vector[start + 1]); |
| assertEquals("bad", listStrs.toString(start + 1)); |
| assertEquals(0, map.lengths[0]); |
| assertEquals(Timestamp.valueOf("2000-03-12 15:00:00"), |
| timestamp.asScratchTimestamp(0)); |
| assertEquals(new HiveDecimalWritable(HiveDecimal.create("12345678.6547456")), |
| decs.vector[0]); |
| |
| // check the contents of row 7499 |
| rows.seekToRow(7499); |
| Assert.assertEquals(true, rows.nextBatch(batch)); |
| assertEquals(true, getBoolean(batch, 0)); |
| assertEquals(100, getByte(batch, 0)); |
| assertEquals(2048, getShort(batch, 0)); |
| assertEquals(65536, getInt(batch, 0)); |
| assertEquals(Long.MAX_VALUE, getLong(batch, 0)); |
| assertEquals(2.0, getFloat(batch, 0), 0.00001); |
| assertEquals(-5.0, getDouble(batch, 0), 0.00001); |
| assertEquals(bytes(), getBinary(batch, 0)); |
| assertEquals("bye", getText(batch, 0).toString()); |
| assertEquals(false, middle.isNull[0]); |
| assertEquals(2, midList.lengths[0]); |
| start = (int) midList.offsets[0]; |
| assertEquals(1, midListInt.vector[start]); |
| assertEquals("bye", midListStr.toString(start)); |
| assertEquals(2, midListInt.vector[start + 1]); |
| assertEquals("sigh", midListStr.toString(start + 1)); |
| assertEquals(3, list.lengths[0]); |
| start = (int) list.offsets[0]; |
| assertEquals(100000000, listInts.vector[start]); |
| assertEquals("cat", listStrs.toString(start)); |
| assertEquals(-100000, listInts.vector[start + 1]); |
| assertEquals("in", listStrs.toString(start + 1)); |
| assertEquals(1234, listInts.vector[start + 2]); |
| assertEquals("hat", listStrs.toString(start + 2)); |
| assertEquals(2, map.lengths[0]); |
| start = (int) map.offsets[0]; |
| assertEquals("chani", mapKey.toString(start)); |
| assertEquals(5, mapValueInts.vector[start]); |
| assertEquals("chani", mapValueStrs.toString(start)); |
| assertEquals("mauddib", mapKey.toString(start + 1)); |
| assertEquals(1, mapValueInts.vector[start + 1]); |
| assertEquals("mauddib", mapValueStrs.toString(start + 1)); |
| assertEquals(Timestamp.valueOf("2000-03-12 15:00:01"), |
| timestamp.asScratchTimestamp(0)); |
| assertEquals(new HiveDecimalWritable(HiveDecimal.create("12345678.6547457")), |
| decs.vector[0]); |
| |
| // handle the close up |
| Assert.assertEquals(false, rows.nextBatch(batch)); |
| rows.close(); |
| } |
| |
| @Test |
| public void testTimestamp() throws Exception { |
| TypeDescription schema = TypeDescription.createTimestamp(); |
| Writer writer = OrcFile.createWriter(testFilePath, |
| OrcFile.writerOptions(conf).setSchema(schema).stripeSize(100000) |
| .bufferSize(10000).version(fileFormat)); |
| List<Timestamp> tslist = Lists.newArrayList(); |
| tslist.add(Timestamp.valueOf("2037-01-01 00:00:00.000999")); |
| tslist.add(Timestamp.valueOf("2003-01-01 00:00:00.000000222")); |
| tslist.add(Timestamp.valueOf("1999-01-01 00:00:00.999999999")); |
| tslist.add(Timestamp.valueOf("1995-01-01 00:00:00.688888888")); |
| tslist.add(Timestamp.valueOf("2002-01-01 00:00:00.1")); |
| tslist.add(Timestamp.valueOf("2010-03-02 00:00:00.000009001")); |
| tslist.add(Timestamp.valueOf("2005-01-01 00:00:00.000002229")); |
| tslist.add(Timestamp.valueOf("2006-01-01 00:00:00.900203003")); |
| tslist.add(Timestamp.valueOf("2003-01-01 00:00:00.800000007")); |
| tslist.add(Timestamp.valueOf("1996-08-02 00:00:00.723100809")); |
| tslist.add(Timestamp.valueOf("1998-11-02 00:00:00.857340643")); |
| tslist.add(Timestamp.valueOf("2008-10-02 00:00:00")); |
| |
| VectorizedRowBatch batch = new VectorizedRowBatch(1, 1024); |
| TimestampColumnVector vec = new TimestampColumnVector(1024); |
| batch.cols[0] = vec; |
| batch.reset(); |
| batch.size = tslist.size(); |
| for (int i=0; i < tslist.size(); ++i) { |
| Timestamp ts = tslist.get(i); |
| vec.set(i, ts); |
| } |
| writer.addRowBatch(batch); |
| writer.close(); |
| |
| Reader reader = OrcFile.createReader(testFilePath, |
| OrcFile.readerOptions(conf).filesystem(fs)); |
| RecordReader rows = reader.rows(); |
| batch = reader.getSchema().createRowBatch(); |
| TimestampColumnVector timestamps = (TimestampColumnVector) batch.cols[0]; |
| int idx = 0; |
| while (rows.nextBatch(batch)) { |
| for(int r=0; r < batch.size; ++r) { |
| assertEquals(tslist.get(idx++).getNanos(), |
| timestamps.asScratchTimestamp(r).getNanos()); |
| } |
| } |
| Assert.assertEquals(tslist.size(), rows.getRowNumber()); |
| assertEquals(0, writer.getSchema().getMaximumId()); |
| boolean[] expected = new boolean[] {false}; |
| boolean[] included = OrcUtils.includeColumns("", writer.getSchema()); |
| assertEquals(true, Arrays.equals(expected, included)); |
| } |
| |
| @Test |
| public void testStringAndBinaryStatistics() throws Exception { |
| |
| TypeDescription schema = TypeDescription.createStruct() |
| .addField("bytes1", TypeDescription.createBinary()) |
| .addField("string1", TypeDescription.createString()); |
| Writer writer = OrcFile.createWriter(testFilePath, |
| OrcFile.writerOptions(conf) |
| .setSchema(schema) |
| .stripeSize(100000) |
| .bufferSize(10000) |
| .version(fileFormat)); |
| VectorizedRowBatch batch = schema.createRowBatch(); |
| batch.size = 4; |
| BytesColumnVector field1 = (BytesColumnVector) batch.cols[0]; |
| BytesColumnVector field2 = (BytesColumnVector) batch.cols[1]; |
| field1.setVal(0, bytesArray(0, 1, 2, 3, 4)); |
| field1.setVal(1, bytesArray(0, 1, 2, 3)); |
| field1.setVal(2, bytesArray(0, 1, 2, 3, 4, 5)); |
| field1.noNulls = false; |
| field1.isNull[3] = true; |
| field2.setVal(0, "foo".getBytes(StandardCharsets.UTF_8)); |
| field2.setVal(1, "bar".getBytes(StandardCharsets.UTF_8)); |
| field2.noNulls = false; |
| field2.isNull[2] = true; |
| field2.setVal(3, "hi".getBytes(StandardCharsets.UTF_8)); |
| writer.addRowBatch(batch); |
| writer.close(); |
| schema = writer.getSchema(); |
| assertEquals(2, schema.getMaximumId()); |
| |
| Reader reader = OrcFile.createReader(testFilePath, |
| OrcFile.readerOptions(conf).filesystem(fs)); |
| |
| boolean[] expected = new boolean[] {false, false, true}; |
| boolean[] included = OrcUtils.includeColumns("string1", schema); |
| assertEquals(true, Arrays.equals(expected, included)); |
| |
| expected = new boolean[] {false, false, false}; |
| included = OrcUtils.includeColumns("", schema); |
| assertEquals(true, Arrays.equals(expected, included)); |
| |
| expected = new boolean[] {false, false, false}; |
| included = OrcUtils.includeColumns(null, schema); |
| assertEquals(true, Arrays.equals(expected, included)); |
| |
| // check the stats |
| ColumnStatistics[] stats = reader.getStatistics(); |
| assertArrayEquals(stats, writer.getStatistics()); |
| assertEquals(4, stats[0].getNumberOfValues()); |
| assertEquals("count: 4 hasNull: false", stats[0].toString()); |
| |
| assertEquals(3, stats[1].getNumberOfValues()); |
| assertEquals(15, ((BinaryColumnStatistics) stats[1]).getSum()); |
| assertEquals("count: 3 hasNull: true bytesOnDisk: 28 sum: 15", stats[1].toString()); |
| |
| assertEquals(3, stats[2].getNumberOfValues()); |
| assertEquals("bar", ((StringColumnStatistics) stats[2]).getMinimum()); |
| assertEquals("hi", ((StringColumnStatistics) stats[2]).getMaximum()); |
| assertEquals(8, ((StringColumnStatistics) stats[2]).getSum()); |
| assertEquals("count: 3 hasNull: true bytesOnDisk: " + |
| (fileFormat == OrcFile.Version.V_0_11 ? "30" : "22") + |
| " min: bar max: hi sum: 8", |
| stats[2].toString()); |
| |
| // check the inspectors |
| batch = reader.getSchema().createRowBatch(); |
| BytesColumnVector bytes = (BytesColumnVector) batch.cols[0]; |
| BytesColumnVector strs = (BytesColumnVector) batch.cols[1]; |
| RecordReader rows = reader.rows(); |
| Assert.assertEquals(true, rows.nextBatch(batch)); |
| assertEquals(4, batch.size); |
| |
| // check the contents of the first row |
| assertEquals(bytes(0,1,2,3,4), getBinary(bytes, 0)); |
| assertEquals("foo", strs.toString(0)); |
| |
| // check the contents of second row |
| assertEquals(bytes(0,1,2,3), getBinary(bytes, 1)); |
| assertEquals("bar", strs.toString(1)); |
| |
| // check the contents of third row |
| assertEquals(bytes(0,1,2,3,4,5), getBinary(bytes, 2)); |
| assertNull(strs.toString(2)); |
| |
| // check the contents of fourth row |
| assertNull(getBinary(bytes, 3)); |
| assertEquals("hi", strs.toString(3)); |
| |
| // handle the close up |
| Assert.assertEquals(false, rows.nextBatch(batch)); |
| rows.close(); |
| } |
| |
| @Test |
| public void testHiveDecimalStatsAllNulls() throws Exception { |
| TypeDescription schema = TypeDescription.createStruct() |
| .addField("dec1", TypeDescription.createDecimal()); |
| |
| Writer writer = OrcFile.createWriter(testFilePath, |
| OrcFile.writerOptions(conf).setSchema(schema).stripeSize(100000) |
| .bufferSize(10000).version(fileFormat)); |
| VectorizedRowBatch batch = schema.createRowBatch(); |
| batch.size = 4; |
| DecimalColumnVector field1 = (DecimalColumnVector) batch.cols[0]; |
| field1.noNulls = false; |
| field1.isNull[0] = true; |
| field1.isNull[1] = true; |
| field1.isNull[2] = true; |
| field1.isNull[3] = true; |
| writer.addRowBatch(batch); |
| writer.close(); |
| |
| Reader reader = OrcFile.createReader(testFilePath, |
| OrcFile.readerOptions(conf).filesystem(fs)); |
| // check the stats |
| ColumnStatistics[] stats = reader.getStatistics(); |
| Assert.assertEquals(4, stats[0].getNumberOfValues()); |
| Assert.assertEquals(0, stats[1].getNumberOfValues()); |
| Assert.assertEquals(true, stats[1].hasNull()); |
| Assert.assertNull(((DecimalColumnStatistics)stats[1]).getMinimum()); |
| Assert.assertNull(((DecimalColumnStatistics)stats[1]).getMaximum()); |
| Assert.assertEquals(new HiveDecimalWritable(0).getHiveDecimal(), ((DecimalColumnStatistics)stats[1]).getSum()); |
| } |
| |
| @Test |
| public void testStripeLevelStats() throws Exception { |
| TypeDescription schema = |
| TypeDescription.fromString("struct<int1:int,string1:string>"); |
| Writer writer = OrcFile.createWriter(testFilePath, |
| OrcFile.writerOptions(conf) |
| .setSchema(schema) |
| .stripeSize(100000) |
| .bufferSize(10000) |
| .version(fileFormat)); |
| VectorizedRowBatch batch = schema.createRowBatch(); |
| batch.size = 1000; |
| LongColumnVector field1 = (LongColumnVector) batch.cols[0]; |
| BytesColumnVector field2 = (BytesColumnVector) batch.cols[1]; |
| field1.isRepeating = true; |
| field2.isRepeating = true; |
| for (int b = 0; b < 11; b++) { |
| if (b >= 5) { |
| if (b >= 10) { |
| field1.vector[0] = 3; |
| field2.setVal(0, "three".getBytes(StandardCharsets.UTF_8)); |
| } else { |
| field1.vector[0] = 2; |
| field2.setVal(0, "two".getBytes(StandardCharsets.UTF_8)); |
| } |
| } else { |
| field1.vector[0] = 1; |
| field2.setVal(0, "one".getBytes(StandardCharsets.UTF_8)); |
| } |
| writer.addRowBatch(batch); |
| } |
| |
| writer.close(); |
| Reader reader = OrcFile.createReader(testFilePath, |
| OrcFile.readerOptions(conf).filesystem(fs)); |
| |
| schema = writer.getSchema(); |
| assertEquals(2, schema.getMaximumId()); |
| boolean[] expected = new boolean[] {false, true, false}; |
| boolean[] included = OrcUtils.includeColumns("int1", schema); |
| assertEquals(true, Arrays.equals(expected, included)); |
| |
| List<StripeStatistics> stats = reader.getStripeStatistics(); |
| int numStripes = stats.size(); |
| assertEquals(3, numStripes); |
| StripeStatistics ss1 = stats.get(0); |
| StripeStatistics ss2 = stats.get(1); |
| StripeStatistics ss3 = stats.get(2); |
| |
| assertEquals(5000, ss1.getColumnStatistics()[0].getNumberOfValues()); |
| assertEquals(5000, ss2.getColumnStatistics()[0].getNumberOfValues()); |
| assertEquals(1000, ss3.getColumnStatistics()[0].getNumberOfValues()); |
| |
| assertEquals(5000, (ss1.getColumnStatistics()[1]).getNumberOfValues()); |
| assertEquals(5000, (ss2.getColumnStatistics()[1]).getNumberOfValues()); |
| assertEquals(1000, (ss3.getColumnStatistics()[1]).getNumberOfValues()); |
| assertEquals(1, ((IntegerColumnStatistics)ss1.getColumnStatistics()[1]).getMinimum()); |
| assertEquals(2, ((IntegerColumnStatistics)ss2.getColumnStatistics()[1]).getMinimum()); |
| assertEquals(3, ((IntegerColumnStatistics)ss3.getColumnStatistics()[1]).getMinimum()); |
| assertEquals(1, ((IntegerColumnStatistics)ss1.getColumnStatistics()[1]).getMaximum()); |
| assertEquals(2, ((IntegerColumnStatistics)ss2.getColumnStatistics()[1]).getMaximum()); |
| assertEquals(3, ((IntegerColumnStatistics)ss3.getColumnStatistics()[1]).getMaximum()); |
| assertEquals(5000, ((IntegerColumnStatistics)ss1.getColumnStatistics()[1]).getSum()); |
| assertEquals(10000, ((IntegerColumnStatistics)ss2.getColumnStatistics()[1]).getSum()); |
| assertEquals(3000, ((IntegerColumnStatistics)ss3.getColumnStatistics()[1]).getSum()); |
| |
| assertEquals(5000, (ss1.getColumnStatistics()[2]).getNumberOfValues()); |
| assertEquals(5000, (ss2.getColumnStatistics()[2]).getNumberOfValues()); |
| assertEquals(1000, (ss3.getColumnStatistics()[2]).getNumberOfValues()); |
| assertEquals("one", ((StringColumnStatistics)ss1.getColumnStatistics()[2]).getMinimum()); |
| assertEquals("two", ((StringColumnStatistics)ss2.getColumnStatistics()[2]).getMinimum()); |
| assertEquals("three", ((StringColumnStatistics)ss3.getColumnStatistics()[2]).getMinimum()); |
| assertEquals("one", ((StringColumnStatistics)ss1.getColumnStatistics()[2]).getMaximum()); |
| assertEquals("two", ((StringColumnStatistics) ss2.getColumnStatistics()[2]).getMaximum()); |
| assertEquals("three", ((StringColumnStatistics)ss3.getColumnStatistics()[2]).getMaximum()); |
| assertEquals(15000, ((StringColumnStatistics)ss1.getColumnStatistics()[2]).getSum()); |
| assertEquals(15000, ((StringColumnStatistics)ss2.getColumnStatistics()[2]).getSum()); |
| assertEquals(5000, ((StringColumnStatistics)ss3.getColumnStatistics()[2]).getSum()); |
| |
| RecordReaderImpl recordReader = (RecordReaderImpl) reader.rows(); |
| OrcProto.RowIndex[] index = recordReader.readRowIndex(0, null, null).getRowGroupIndex(); |
| assertEquals(3, index.length); |
| List<OrcProto.RowIndexEntry> items = index[1].getEntryList(); |
| assertEquals(1, items.size()); |
| assertEquals(3, items.get(0).getPositionsCount()); |
| assertEquals(0, items.get(0).getPositions(0)); |
| assertEquals(0, items.get(0).getPositions(1)); |
| assertEquals(0, items.get(0).getPositions(2)); |
| assertEquals(1, |
| items.get(0).getStatistics().getIntStatistics().getMinimum()); |
| index = recordReader.readRowIndex(1, null, null).getRowGroupIndex(); |
| assertEquals(3, index.length); |
| items = index[1].getEntryList(); |
| assertEquals(2, |
| items.get(0).getStatistics().getIntStatistics().getMaximum()); |
| } |
| |
| private static void setInner(StructColumnVector inner, int rowId, |
| int i, String value) { |
| ((LongColumnVector) inner.fields[0]).vector[rowId] = i; |
| if (value != null) { |
| ((BytesColumnVector) inner.fields[1]).setVal(rowId, value.getBytes(StandardCharsets.UTF_8)); |
| } else { |
| inner.fields[1].isNull[rowId] = true; |
| inner.fields[1].noNulls = false; |
| } |
| } |
| |
| private static void checkInner(StructColumnVector inner, int rowId, |
| int rowInBatch, int i, String value) { |
| assertEquals("row " + rowId, i, |
| ((LongColumnVector) inner.fields[0]).vector[rowInBatch]); |
| if (value != null) { |
| assertEquals("row " + rowId, value, |
| ((BytesColumnVector) inner.fields[1]).toString(rowInBatch)); |
| } else { |
| assertEquals("row " + rowId, true, inner.fields[1].isNull[rowInBatch]); |
| assertEquals("row " + rowId, false, inner.fields[1].noNulls); |
| } |
| } |
| |
| private static void setInnerList(ListColumnVector list, int rowId, |
| List<InnerStruct> value) { |
| if (value != null) { |
| if (list.childCount + value.size() > list.child.isNull.length) { |
| list.child.ensureSize(list.childCount * 2, true); |
| } |
| list.lengths[rowId] = value.size(); |
| list.offsets[rowId] = list.childCount; |
| for (int i = 0; i < list.lengths[rowId]; ++i) { |
| InnerStruct inner = value.get(i); |
| setInner((StructColumnVector) list.child, i + list.childCount, |
| inner.int1, inner.string1.toString()); |
| } |
| list.childCount += value.size(); |
| } else { |
| list.isNull[rowId] = true; |
| list.noNulls = false; |
| } |
| } |
| |
| private static void checkInnerList(ListColumnVector list, int rowId, |
| int rowInBatch, List<InnerStruct> value) { |
| if (value != null) { |
| assertEquals("row " + rowId, value.size(), list.lengths[rowInBatch]); |
| int start = (int) list.offsets[rowInBatch]; |
| for (int i = 0; i < list.lengths[rowInBatch]; ++i) { |
| InnerStruct inner = value.get(i); |
| checkInner((StructColumnVector) list.child, rowId, i + start, |
| inner.int1, inner.string1.toString()); |
| } |
| list.childCount += value.size(); |
| } else { |
| assertEquals("row " + rowId, true, list.isNull[rowInBatch]); |
| assertEquals("row " + rowId, false, list.noNulls); |
| } |
| } |
| |
| private static void setInnerMap(MapColumnVector map, int rowId, |
| Map<String, InnerStruct> value) { |
| if (value != null) { |
| if (map.childCount >= map.keys.isNull.length) { |
| map.keys.ensureSize(map.childCount * 2, true); |
| map.values.ensureSize(map.childCount * 2, true); |
| } |
| map.lengths[rowId] = value.size(); |
| int offset = map.childCount; |
| map.offsets[rowId] = offset; |
| |
| for (Map.Entry<String, InnerStruct> entry : value.entrySet()) { |
| ((BytesColumnVector) map.keys).setVal(offset, entry.getKey().getBytes(StandardCharsets.UTF_8)); |
| InnerStruct inner = entry.getValue(); |
| setInner((StructColumnVector) map.values, offset, inner.int1, |
| inner.string1.toString()); |
| offset += 1; |
| } |
| map.childCount = offset; |
| } else { |
| map.isNull[rowId] = true; |
| map.noNulls = false; |
| } |
| } |
| |
| private static void checkInnerMap(MapColumnVector map, int rowId, |
| int rowInBatch, |
| Map<String, InnerStruct> value) { |
| if (value != null) { |
| assertEquals("row " + rowId, value.size(), map.lengths[rowInBatch]); |
| int offset = (int) map.offsets[rowInBatch]; |
| for(int i=0; i < value.size(); ++i) { |
| String key = ((BytesColumnVector) map.keys).toString(offset + i); |
| InnerStruct expected = value.get(key); |
| checkInner((StructColumnVector) map.values, rowId, offset + i, |
| expected.int1, expected.string1.toString()); |
| } |
| } else { |
| assertEquals("row " + rowId, true, map.isNull[rowId]); |
| assertEquals("row " + rowId, false, map.noNulls); |
| } |
| } |
| |
| private static void setMiddleStruct(StructColumnVector middle, int rowId, |
| MiddleStruct value) { |
| if (value != null) { |
| setInnerList((ListColumnVector) middle.fields[0], rowId, value.list); |
| } else { |
| middle.isNull[rowId] = true; |
| middle.noNulls = false; |
| } |
| } |
| |
| private static void checkMiddleStruct(StructColumnVector middle, int rowId, |
| int rowInBatch, MiddleStruct value) { |
| if (value != null) { |
| checkInnerList((ListColumnVector) middle.fields[0], rowId, rowInBatch, |
| value.list); |
| } else { |
| assertEquals("row " + rowId, true, middle.isNull[rowInBatch]); |
| assertEquals("row " + rowId, false, middle.noNulls); |
| } |
| } |
| |
| private static void setBigRow(VectorizedRowBatch batch, int rowId, |
| Boolean b1, Byte b2, Short s1, |
| Integer i1, Long l1, Float f1, |
| Double d1, BytesWritable b3, String s2, |
| MiddleStruct m1, List<InnerStruct> l2, |
| Map<String, InnerStruct> m2) { |
| ((LongColumnVector) batch.cols[0]).vector[rowId] = b1 ? 1 : 0; |
| ((LongColumnVector) batch.cols[1]).vector[rowId] = b2; |
| ((LongColumnVector) batch.cols[2]).vector[rowId] = s1; |
| ((LongColumnVector) batch.cols[3]).vector[rowId] = i1; |
| ((LongColumnVector) batch.cols[4]).vector[rowId] = l1; |
| ((DoubleColumnVector) batch.cols[5]).vector[rowId] = f1; |
| ((DoubleColumnVector) batch.cols[6]).vector[rowId] = d1; |
| if (b3 != null) { |
| ((BytesColumnVector) batch.cols[7]).setVal(rowId, b3.getBytes(), 0, |
| b3.getLength()); |
| } else { |
| batch.cols[7].isNull[rowId] = true; |
| batch.cols[7].noNulls = false; |
| } |
| if (s2 != null) { |
| ((BytesColumnVector) batch.cols[8]).setVal(rowId, s2.getBytes(StandardCharsets.UTF_8)); |
| } else { |
| batch.cols[8].isNull[rowId] = true; |
| batch.cols[8].noNulls = false; |
| } |
| setMiddleStruct((StructColumnVector) batch.cols[9], rowId, m1); |
| setInnerList((ListColumnVector) batch.cols[10], rowId, l2); |
| setInnerMap((MapColumnVector) batch.cols[11], rowId, m2); |
| } |
| |
| private static void checkBigRow(VectorizedRowBatch batch, |
| int rowInBatch, |
| int rowId, |
| boolean b1, byte b2, short s1, |
| int i1, long l1, float f1, |
| double d1, BytesWritable b3, String s2, |
| MiddleStruct m1, List<InnerStruct> l2, |
| Map<String, InnerStruct> m2) { |
| assertEquals("row " + rowId, b1, getBoolean(batch, rowInBatch)); |
| assertEquals("row " + rowId, b2, getByte(batch, rowInBatch)); |
| assertEquals("row " + rowId, s1, getShort(batch, rowInBatch)); |
| assertEquals("row " + rowId, i1, getInt(batch, rowInBatch)); |
| assertEquals("row " + rowId, l1, getLong(batch, rowInBatch)); |
| assertEquals("row " + rowId, f1, getFloat(batch, rowInBatch), 0.0001); |
| assertEquals("row " + rowId, d1, getDouble(batch, rowInBatch), 0.0001); |
| if (b3 != null) { |
| BytesColumnVector bytes = (BytesColumnVector) batch.cols[7]; |
| assertEquals("row " + rowId, b3.getLength(), bytes.length[rowInBatch]); |
| for(int i=0; i < b3.getLength(); ++i) { |
| assertEquals("row " + rowId + " byte " + i, b3.getBytes()[i], |
| bytes.vector[rowInBatch][bytes.start[rowInBatch] + i]); |
| } |
| } else { |
| assertEquals("row " + rowId, true, batch.cols[7].isNull[rowInBatch]); |
| assertEquals("row " + rowId, false, batch.cols[7].noNulls); |
| } |
| if (s2 != null) { |
| assertEquals("row " + rowId, s2, getText(batch, rowInBatch).toString()); |
| } else { |
| assertEquals("row " + rowId, true, batch.cols[8].isNull[rowInBatch]); |
| assertEquals("row " + rowId, false, batch.cols[8].noNulls); |
| } |
| checkMiddleStruct((StructColumnVector) batch.cols[9], rowId, rowInBatch, |
| m1); |
| checkInnerList((ListColumnVector) batch.cols[10], rowId, rowInBatch, l2); |
| checkInnerMap((MapColumnVector) batch.cols[11], rowId, rowInBatch, m2); |
| } |
| |
| private static boolean getBoolean(VectorizedRowBatch batch, int rowId) { |
| return ((LongColumnVector) batch.cols[0]).vector[rowId] != 0; |
| } |
| |
| private static byte getByte(VectorizedRowBatch batch, int rowId) { |
| return (byte) ((LongColumnVector) batch.cols[1]).vector[rowId]; |
| } |
| |
| private static short getShort(VectorizedRowBatch batch, int rowId) { |
| return (short) ((LongColumnVector) batch.cols[2]).vector[rowId]; |
| } |
| |
| private static int getInt(VectorizedRowBatch batch, int rowId) { |
| return (int) ((LongColumnVector) batch.cols[3]).vector[rowId]; |
| } |
| |
| private static long getLong(VectorizedRowBatch batch, int rowId) { |
| return ((LongColumnVector) batch.cols[4]).vector[rowId]; |
| } |
| |
| private static float getFloat(VectorizedRowBatch batch, int rowId) { |
| return (float) ((DoubleColumnVector) batch.cols[5]).vector[rowId]; |
| } |
| |
| private static double getDouble(VectorizedRowBatch batch, int rowId) { |
| return ((DoubleColumnVector) batch.cols[6]).vector[rowId]; |
| } |
| |
| private static BytesWritable getBinary(BytesColumnVector column, int rowId) { |
| if (column.isRepeating) { |
| rowId = 0; |
| } |
| if (column.noNulls || !column.isNull[rowId]) { |
| return new BytesWritable(Arrays.copyOfRange(column.vector[rowId], |
| column.start[rowId], column.start[rowId] + column.length[rowId])); |
| } else { |
| return null; |
| } |
| } |
| |
| private static BytesWritable getBinary(VectorizedRowBatch batch, int rowId) { |
| return getBinary((BytesColumnVector) batch.cols[7], rowId); |
| } |
| |
| private static Text getText(BytesColumnVector vector, int rowId) { |
| if (vector.isRepeating) { |
| rowId = 0; |
| } |
| if (vector.noNulls || !vector.isNull[rowId]) { |
| return new Text(Arrays.copyOfRange(vector.vector[rowId], |
| vector.start[rowId], vector.start[rowId] + vector.length[rowId])); |
| } else { |
| return null; |
| } |
| } |
| |
| private static Text getText(VectorizedRowBatch batch, int rowId) { |
| return getText((BytesColumnVector) batch.cols[8], rowId); |
| } |
| |
| private static InnerStruct getInner(StructColumnVector vector, |
| int rowId) { |
| return new InnerStruct( |
| (int) ((LongColumnVector) vector.fields[0]).vector[rowId], |
| getText((BytesColumnVector) vector.fields[1], rowId)); |
| } |
| |
| private static List<InnerStruct> getList(ListColumnVector cv, |
| int rowId) { |
| if (cv.isRepeating) { |
| rowId = 0; |
| } |
| if (cv.noNulls || !cv.isNull[rowId]) { |
| List<InnerStruct> result = |
| new ArrayList<InnerStruct>((int) cv.lengths[rowId]); |
| for(long i=cv.offsets[rowId]; |
| i < cv.offsets[rowId] + cv.lengths[rowId]; ++i) { |
| result.add(getInner((StructColumnVector) cv.child, (int) i)); |
| } |
| return result; |
| } else { |
| return null; |
| } |
| } |
| |
| private static List<InnerStruct> getMidList(VectorizedRowBatch batch, |
| int rowId) { |
| return getList((ListColumnVector) ((StructColumnVector) batch.cols[9]) |
| .fields[0], rowId); |
| } |
| |
| private static List<InnerStruct> getList(VectorizedRowBatch batch, |
| int rowId) { |
| return getList((ListColumnVector) batch.cols[10], rowId); |
| } |
| |
| private static Map<Text, InnerStruct> getMap(VectorizedRowBatch batch, |
| int rowId) { |
| MapColumnVector cv = (MapColumnVector) batch.cols[11]; |
| if (cv.isRepeating) { |
| rowId = 0; |
| } |
| if (cv.noNulls || !cv.isNull[rowId]) { |
| Map<Text, InnerStruct> result = |
| new HashMap<Text, InnerStruct>((int) cv.lengths[rowId]); |
| for(long i=cv.offsets[rowId]; |
| i < cv.offsets[rowId] + cv.lengths[rowId]; ++i) { |
| result.put(getText((BytesColumnVector) cv.keys, (int) i), |
| getInner((StructColumnVector) cv.values, (int) i)); |
| } |
| return result; |
| } else { |
| return null; |
| } |
| } |
| |
| private static TypeDescription createInnerSchema() { |
| return TypeDescription.fromString("struct<int1:int,string1:string>"); |
| } |
| |
| private static TypeDescription createComplexInnerSchema() |
| { |
| return TypeDescription.fromString("struct<int1:int," |
| + "complex:struct<int2:int,String1:string>>"); |
| } |
| |
| private static TypeDescription createBigRowSchema() { |
| return TypeDescription.createStruct() |
| .addField("boolean1", TypeDescription.createBoolean()) |
| .addField("byte1", TypeDescription.createByte()) |
| .addField("short1", TypeDescription.createShort()) |
| .addField("int1", TypeDescription.createInt()) |
| .addField("long1", TypeDescription.createLong()) |
| .addField("float1", TypeDescription.createFloat()) |
| .addField("double1", TypeDescription.createDouble()) |
| .addField("bytes1", TypeDescription.createBinary()) |
| .addField("string1", TypeDescription.createString()) |
| .addField("middle", TypeDescription.createStruct() |
| .addField("list", TypeDescription.createList(createInnerSchema()))) |
| .addField("list", TypeDescription.createList(createInnerSchema())) |
| .addField("map", TypeDescription.createMap( |
| TypeDescription.createString(), |
| createInnerSchema())); |
| } |
| |
| static void assertArrayBooleanEquals(boolean[] expected, boolean[] actual) { |
| assertEquals(expected.length, actual.length); |
| boolean diff = false; |
| for(int i=0; i < expected.length; ++i) { |
| if (expected[i] != actual[i]) { |
| System.out.println("Difference at " + i + " expected: " + expected[i] + |
| " actual: " + actual[i]); |
| diff = true; |
| } |
| } |
| assertEquals(false, diff); |
| } |
| |
| @Test |
| public void test1() throws Exception { |
| TypeDescription schema = createBigRowSchema(); |
| Writer writer = OrcFile.createWriter(testFilePath, |
| OrcFile.writerOptions(conf) |
| .setSchema(schema) |
| .stripeSize(100000) |
| .bufferSize(10000) |
| .version(fileFormat)); |
| assertEmptyStats(writer.getStatistics()); |
| VectorizedRowBatch batch = schema.createRowBatch(); |
| batch.size = 2; |
| setBigRow(batch, 0, false, (byte) 1, (short) 1024, 65536, |
| Long.MAX_VALUE, (float) 1.0, -15.0, bytes(0, 1, 2, 3, 4), "hi", |
| new MiddleStruct(inner(1, "bye"), inner(2, "sigh")), |
| list(inner(3, "good"), inner(4, "bad")), |
| map()); |
| setBigRow(batch, 1, true, (byte) 100, (short) 2048, 65536, |
| Long.MAX_VALUE, (float) 2.0, -5.0, bytes(), "bye", |
| new MiddleStruct(inner(1, "bye"), inner(2, "sigh")), |
| list(inner(100000000, "cat"), inner(-100000, "in"), inner(1234, "hat")), |
| map(inner(5, "chani"), inner(1, "mauddib"))); |
| writer.addRowBatch(batch); |
| assertEmptyStats(writer.getStatistics()); |
| writer.close(); |
| ColumnStatistics[] closeStatistics = writer.getStatistics(); |
| Reader reader = OrcFile.createReader(testFilePath, |
| OrcFile.readerOptions(conf).filesystem(fs)); |
| |
| schema = writer.getSchema(); |
| assertEquals(23, schema.getMaximumId()); |
| boolean[] expected = new boolean[] {false, false, false, false, false, |
| false, false, false, false, false, |
| false, false, false, false, false, |
| false, false, false, false, false, |
| false, false, false, false}; |
| boolean[] included = OrcUtils.includeColumns("", schema); |
| assertEquals(true, Arrays.equals(expected, included)); |
| |
| expected = new boolean[] {false, true, false, false, false, |
| false, false, false, false, true, |
| true, true, true, true, true, |
| false, false, false, false, true, |
| true, true, true, true}; |
| included = OrcUtils.includeColumns("boolean1,string1,middle,map", schema); |
| |
| assertArrayBooleanEquals(expected, included); |
| |
| expected = new boolean[] {false, true, false, false, false, |
| false, false, false, false, true, |
| true, true, true, true, true, |
| false, false, false, false, true, |
| true, true, true, true}; |
| included = OrcUtils.includeColumns("boolean1,string1,middle,map", schema); |
| assertArrayBooleanEquals(expected, included); |
| |
| expected = new boolean[] {false, true, true, true, true, |
| true, true, true, true, true, |
| true, true, true, true, true, |
| true, true, true, true, true, |
| true, true, true, true}; |
| included = OrcUtils.includeColumns( |
| "boolean1,byte1,short1,int1,long1,float1,double1,bytes1,string1,middle,list,map", |
| schema); |
| assertEquals(true, Arrays.equals(expected, included)); |
| |
| // check the stats |
| ColumnStatistics[] stats = reader.getStatistics(); |
| assertArrayEquals(stats, closeStatistics); |
| assertEquals(2, stats[1].getNumberOfValues()); |
| assertEquals(1, ((BooleanColumnStatistics) stats[1]).getFalseCount()); |
| assertEquals(1, ((BooleanColumnStatistics) stats[1]).getTrueCount()); |
| assertEquals("count: 2 hasNull: false bytesOnDisk: 5 true: 1", stats[1].toString()); |
| |
| assertEquals(2048, ((IntegerColumnStatistics) stats[3]).getMaximum()); |
| assertEquals(1024, ((IntegerColumnStatistics) stats[3]).getMinimum()); |
| assertEquals(true, ((IntegerColumnStatistics) stats[3]).isSumDefined()); |
| assertEquals(3072, ((IntegerColumnStatistics) stats[3]).getSum()); |
| assertEquals("count: 2 hasNull: false bytesOnDisk: " + |
| (fileFormat == OrcFile.Version.V_0_11 ? "8" : "9") + |
| " min: 1024 max: 2048 sum: 3072", stats[3].toString()); |
| |
| StripeStatistics ss = reader.getStripeStatistics().get(0); |
| assertEquals(2, ss.getColumnStatistics()[0].getNumberOfValues()); |
| assertEquals(1, ((BooleanColumnStatistics) ss.getColumnStatistics()[1]).getTrueCount()); |
| assertEquals(1024, ((IntegerColumnStatistics) ss.getColumnStatistics()[3]).getMinimum()); |
| assertEquals(2048, ((IntegerColumnStatistics) ss.getColumnStatistics()[3]).getMaximum()); |
| assertEquals(3072, ((IntegerColumnStatistics) ss.getColumnStatistics()[3]).getSum()); |
| assertEquals(-15.0, ((DoubleColumnStatistics) stats[7]).getMinimum(), 0.0001); |
| assertEquals(-5.0, ((DoubleColumnStatistics) stats[7]).getMaximum(), 0.0001); |
| assertEquals(-20.0, ((DoubleColumnStatistics) stats[7]).getSum(), 0.00001); |
| assertEquals("count: 2 hasNull: false bytesOnDisk: 15 min: -15.0 max: -5.0 sum: -20.0", |
| stats[7].toString()); |
| |
| assertEquals("count: 2 hasNull: false bytesOnDisk: " + |
| (fileFormat == OrcFile.Version.V_0_11 ? "20" : "14") + |
| " min: bye max: hi sum: 5", stats[9].toString()); |
| |
| // check the schema |
| TypeDescription readerSchema = reader.getSchema(); |
| assertEquals(TypeDescription.Category.STRUCT, readerSchema.getCategory()); |
| assertEquals("struct<boolean1:boolean,byte1:tinyint,short1:smallint," |
| + "int1:int,long1:bigint,float1:float,double1:double,bytes1:" |
| + "binary,string1:string,middle:struct<list:array<struct<int1:int," |
| + "string1:string>>>,list:array<struct<int1:int,string1:string>>," |
| + "map:map<string,struct<int1:int,string1:string>>>", |
| readerSchema.toString()); |
| List<String> fieldNames = readerSchema.getFieldNames(); |
| List<TypeDescription> fieldTypes = readerSchema.getChildren(); |
| assertEquals("boolean1", fieldNames.get(0)); |
| assertEquals(TypeDescription.Category.BOOLEAN, fieldTypes.get(0).getCategory()); |
| assertEquals("byte1", fieldNames.get(1)); |
| assertEquals(TypeDescription.Category.BYTE, fieldTypes.get(1).getCategory()); |
| assertEquals("short1", fieldNames.get(2)); |
| assertEquals(TypeDescription.Category.SHORT, fieldTypes.get(2).getCategory()); |
| assertEquals("int1", fieldNames.get(3)); |
| assertEquals(TypeDescription.Category.INT, fieldTypes.get(3).getCategory()); |
| assertEquals("long1", fieldNames.get(4)); |
| assertEquals(TypeDescription.Category.LONG, fieldTypes.get(4).getCategory()); |
| assertEquals("float1", fieldNames.get(5)); |
| assertEquals(TypeDescription.Category.FLOAT, fieldTypes.get(5).getCategory()); |
| assertEquals("double1", fieldNames.get(6)); |
| assertEquals(TypeDescription.Category.DOUBLE, fieldTypes.get(6).getCategory()); |
| assertEquals("bytes1", fieldNames.get(7)); |
| assertEquals(TypeDescription.Category.BINARY, fieldTypes.get(7).getCategory()); |
| assertEquals("string1", fieldNames.get(8)); |
| assertEquals(TypeDescription.Category.STRING, fieldTypes.get(8).getCategory()); |
| assertEquals("middle", fieldNames.get(9)); |
| TypeDescription middle = fieldTypes.get(9); |
| assertEquals(TypeDescription.Category.STRUCT, middle.getCategory()); |
| TypeDescription midList = middle.getChildren().get(0); |
| assertEquals(TypeDescription.Category.LIST, midList.getCategory()); |
| TypeDescription inner = midList.getChildren().get(0); |
| assertEquals(TypeDescription.Category.STRUCT, inner.getCategory()); |
| assertEquals("int1", inner.getFieldNames().get(0)); |
| assertEquals("string1", inner.getFieldNames().get(1)); |
| |
| RecordReader rows = reader.rows(); |
| // create a new batch |
| batch = readerSchema.createRowBatch(); |
| Assert.assertEquals(true, rows.nextBatch(batch)); |
| assertEquals(2, batch.size); |
| Assert.assertEquals(false, rows.nextBatch(batch)); |
| |
| // check the contents of the first row |
| assertEquals(false, getBoolean(batch, 0)); |
| assertEquals(1, getByte(batch, 0)); |
| assertEquals(1024, getShort(batch, 0)); |
| assertEquals(65536, getInt(batch, 0)); |
| assertEquals(Long.MAX_VALUE, getLong(batch, 0)); |
| assertEquals(1.0, getFloat(batch, 0), 0.00001); |
| assertEquals(-15.0, getDouble(batch, 0), 0.00001); |
| assertEquals(bytes(0,1,2,3,4), getBinary(batch, 0)); |
| assertEquals("hi", getText(batch, 0).toString()); |
| List<InnerStruct> midRow = getMidList(batch, 0); |
| assertNotNull(midRow); |
| assertEquals(2, midRow.size()); |
| assertEquals(1, midRow.get(0).int1); |
| assertEquals("bye", midRow.get(0).string1.toString()); |
| assertEquals(2, midRow.get(1).int1); |
| assertEquals("sigh", midRow.get(1).string1.toString()); |
| List<InnerStruct> list = getList(batch, 0); |
| assertEquals(2, list.size()); |
| assertEquals(3, list.get(0).int1); |
| assertEquals("good", list.get(0).string1.toString()); |
| assertEquals(4, list.get(1).int1); |
| assertEquals("bad", list.get(1).string1.toString()); |
| Map<Text, InnerStruct> map = getMap(batch, 0); |
| assertEquals(0, map.size()); |
| |
| // check the contents of second row |
| assertEquals(true, getBoolean(batch, 1)); |
| assertEquals(100, getByte(batch, 1)); |
| assertEquals(2048, getShort(batch, 1)); |
| assertEquals(65536, getInt(batch, 1)); |
| assertEquals(Long.MAX_VALUE, getLong(batch, 1)); |
| assertEquals(2.0, getFloat(batch, 1), 0.00001); |
| assertEquals(-5.0, getDouble(batch, 1), 0.00001); |
| assertEquals(bytes(), getBinary(batch, 1)); |
| assertEquals("bye", getText(batch, 1).toString()); |
| midRow = getMidList(batch, 1); |
| assertNotNull(midRow); |
| assertEquals(2, midRow.size()); |
| assertEquals(1, midRow.get(0).int1); |
| assertEquals("bye", midRow.get(0).string1.toString()); |
| assertEquals(2, midRow.get(1).int1); |
| assertEquals("sigh", midRow.get(1).string1.toString()); |
| list = getList(batch, 1); |
| assertEquals(3, list.size()); |
| assertEquals(100000000, list.get(0).int1); |
| assertEquals("cat", list.get(0).string1.toString()); |
| assertEquals(-100000, list.get(1).int1); |
| assertEquals("in", list.get(1).string1.toString()); |
| assertEquals(1234, list.get(2).int1); |
| assertEquals("hat", list.get(2).string1.toString()); |
| map = getMap(batch, 1); |
| assertEquals(2, map.size()); |
| InnerStruct value = map.get(new Text("chani")); |
| assertEquals(5, value.int1); |
| assertEquals("chani", value.string1.toString()); |
| value = map.get(new Text("mauddib")); |
| assertEquals(1, value.int1); |
| assertEquals("mauddib", value.string1.toString()); |
| |
| // handle the close up |
| Assert.assertEquals(false, rows.nextBatch(batch)); |
| rows.close(); |
| } |
| |
| static void assertEmptyStats(ColumnStatistics[] writerStatistics) { |
| for (ColumnStatistics columnStatistics : writerStatistics){ |
| assertEquals(0, columnStatistics.getNumberOfValues()); |
| assertFalse(columnStatistics.hasNull()); |
| } |
| } |
| |
| @Test |
| public void testColumnProjection() throws Exception { |
| TypeDescription schema = createInnerSchema(); |
| Writer writer = OrcFile.createWriter(testFilePath, |
| OrcFile.writerOptions(conf) |
| .setSchema(schema) |
| .stripeSize(1000) |
| .compress(CompressionKind.NONE) |
| .bufferSize(100) |
| .rowIndexStride(1000) |
| .version(fileFormat)); |
| VectorizedRowBatch batch = schema.createRowBatch(); |
| Random r1 = new Random(1); |
| Random r2 = new Random(2); |
| int x; |
| int minInt=0, maxInt=0; |
| String y; |
| String minStr = null, maxStr = null; |
| batch.size = 1000; |
| boolean first = true; |
| for(int b=0; b < 21; ++b) { |
| for(int r=0; r < 1000; ++r) { |
| x = r1.nextInt(); |
| y = Long.toHexString(r2.nextLong()); |
| if (first || x < minInt) { |
| minInt = x; |
| } |
| if (first || x > maxInt) { |
| maxInt = x; |
| } |
| if (first || y.compareTo(minStr) < 0) { |
| minStr = y; |
| } |
| if (first || y.compareTo(maxStr) > 0) { |
| maxStr = y; |
| } |
| first = false; |
| ((LongColumnVector) batch.cols[0]).vector[r] = x; |
| ((BytesColumnVector) batch.cols[1]).setVal(r, y.getBytes(StandardCharsets.UTF_8)); |
| } |
| writer.addRowBatch(batch); |
| } |
| writer.close(); |
| Reader reader = OrcFile.createReader(testFilePath, |
| OrcFile.readerOptions(conf).filesystem(fs)); |
| |
| // check out the statistics |
| ColumnStatistics[] stats = reader.getStatistics(); |
| assertEquals(3, stats.length); |
| for(ColumnStatistics s: stats) { |
| assertEquals(21000, s.getNumberOfValues()); |
| if (s instanceof IntegerColumnStatistics) { |
| assertEquals(minInt, ((IntegerColumnStatistics) s).getMinimum()); |
| assertEquals(maxInt, ((IntegerColumnStatistics) s).getMaximum()); |
| } else if (s instanceof StringColumnStatistics) { |
| assertEquals(maxStr, ((StringColumnStatistics) s).getMaximum()); |
| assertEquals(minStr, ((StringColumnStatistics) s).getMinimum()); |
| } |
| } |
| |
| // check out the types |
| TypeDescription type = reader.getSchema(); |
| assertEquals(TypeDescription.Category.STRUCT, type.getCategory()); |
| assertEquals(2, type.getChildren().size()); |
| TypeDescription type1 = type.getChildren().get(0); |
| TypeDescription type2 = type.getChildren().get(1); |
| assertEquals(TypeDescription.Category.INT, type1.getCategory()); |
| assertEquals(TypeDescription.Category.STRING, type2.getCategory()); |
| assertEquals("struct<int1:int,string1:string>", type.toString()); |
| |
| // read the contents and make sure they match |
| RecordReader rows1 = reader.rows( |
| reader.options().include(new boolean[]{true, true, false})); |
| RecordReader rows2 = reader.rows( |
| reader.options().include(new boolean[]{true, false, true})); |
| r1 = new Random(1); |
| r2 = new Random(2); |
| VectorizedRowBatch batch1 = reader.getSchema().createRowBatch(1000); |
| VectorizedRowBatch batch2 = reader.getSchema().createRowBatch(1000); |
| for(int i = 0; i < 21000; i += 1000) { |
| Assert.assertEquals(true, rows1.nextBatch(batch1)); |
| Assert.assertEquals(true, rows2.nextBatch(batch2)); |
| assertEquals(1000, batch1.size); |
| assertEquals(1000, batch2.size); |
| for(int j=0; j < 1000; ++j) { |
| assertEquals(r1.nextInt(), |
| ((LongColumnVector) batch1.cols[0]).vector[j]); |
| assertEquals(Long.toHexString(r2.nextLong()), |
| ((BytesColumnVector) batch2.cols[1]).toString(j)); |
| } |
| } |
| Assert.assertEquals(false, rows1.nextBatch(batch1)); |
| Assert.assertEquals(false, rows2.nextBatch(batch2)); |
| rows1.close(); |
| rows2.close(); |
| } |
| |
| @Test |
| public void testEmptyFile() throws Exception { |
| TypeDescription schema = createBigRowSchema(); |
| Writer writer = OrcFile.createWriter(testFilePath, |
| OrcFile.writerOptions(conf) |
| .setSchema(schema) |
| .stripeSize(1000) |
| .compress(CompressionKind.NONE) |
| .bufferSize(100) |
| .version(fileFormat)); |
| writer.close(); |
| Reader reader = OrcFile.createReader(testFilePath, |
| OrcFile.readerOptions(conf).filesystem(fs)); |
| VectorizedRowBatch batch = reader.getSchema().createRowBatch(); |
| Assert.assertEquals(false, reader.rows().nextBatch(batch)); |
| Assert.assertEquals(CompressionKind.NONE, reader.getCompressionKind()); |
| Assert.assertEquals(0, reader.getNumberOfRows()); |
| Assert.assertEquals(0, reader.getCompressionSize()); |
| Assert.assertEquals(false, reader.getMetadataKeys().iterator().hasNext()); |
| Assert.assertEquals(3, reader.getContentLength()); |
| Assert.assertEquals(false, reader.getStripes().iterator().hasNext()); |
| } |
| |
| @Test |
| public void metaData() throws Exception { |
| TypeDescription schema = createBigRowSchema(); |
| Writer writer = OrcFile.createWriter(testFilePath, |
| OrcFile.writerOptions(conf) |
| .setSchema(schema) |
| .stripeSize(1000) |
| .compress(CompressionKind.NONE) |
| .bufferSize(100) |
| .version(fileFormat)); |
| writer.addUserMetadata("my.meta", byteBuf(1, 2, 3, 4, 5, 6, 7, -1, -2, 127, |
| -128)); |
| writer.addUserMetadata("clobber", byteBuf(1, 2, 3)); |
| writer.addUserMetadata("clobber", byteBuf(4, 3, 2, 1)); |
| ByteBuffer bigBuf = ByteBuffer.allocate(40000); |
| Random random = new Random(0); |
| random.nextBytes(bigBuf.array()); |
| writer.addUserMetadata("big", bigBuf); |
| bigBuf.position(0); |
| VectorizedRowBatch batch = schema.createRowBatch(); |
| batch.size = 1; |
| setBigRow(batch, 0, true, (byte) 127, (short) 1024, 42, |
| 42L * 1024 * 1024 * 1024, (float) 3.1415, -2.713, null, |
| null, null, null, null); |
| writer.addRowBatch(batch); |
| writer.addUserMetadata("clobber", byteBuf(5,7,11,13,17,19)); |
| writer.close(); |
| |
| Reader reader = OrcFile.createReader(testFilePath, |
| OrcFile.readerOptions(conf).filesystem(fs)); |
| Assert.assertEquals(byteBuf(5, 7, 11, 13, 17, 19), reader.getMetadataValue("clobber")); |
| Assert.assertEquals(byteBuf(1, 2, 3, 4, 5, 6, 7, -1, -2, 127, -128), |
| reader.getMetadataValue("my.meta")); |
| Assert.assertEquals(bigBuf, reader.getMetadataValue("big")); |
| try { |
| reader.getMetadataValue("unknown"); |
| assertTrue(false); |
| } catch (IllegalArgumentException iae) { |
| // PASS |
| } |
| int i = 0; |
| for(String key: reader.getMetadataKeys()) { |
| if ("my.meta".equals(key) || |
| "clobber".equals(key) || |
| "big".equals(key)) { |
| i += 1; |
| } else { |
| throw new IllegalArgumentException("unknown key " + key); |
| } |
| } |
| assertEquals(3, i); |
| int numStripes = reader.getStripeStatistics().size(); |
| assertEquals(1, numStripes); |
| } |
| |
| /** |
| * Generate an ORC file with a range of dates and times. |
| */ |
| public void createOrcDateFile(Path file, int minYear, int maxYear |
| ) throws IOException { |
| TypeDescription schema = TypeDescription.createStruct() |
| .addField("time", TypeDescription.createTimestamp()) |
| .addField("date", TypeDescription.createDate()); |
| Writer writer = OrcFile.createWriter(file, |
| OrcFile.writerOptions(conf) |
| .setSchema(schema) |
| .stripeSize(100000) |
| .bufferSize(10000) |
| .blockPadding(false) |
| .version(fileFormat)); |
| VectorizedRowBatch batch = schema.createRowBatch(); |
| batch.size = 1000; |
| TimestampColumnVector timestampColVector = (TimestampColumnVector) batch.cols[0]; |
| for (int year = minYear; year < maxYear; ++year) { |
| for (int row = 0; row < 1000; ++row) { |
| String timeStr = String.format("%04d-05-05 12:34:56.%04d", year, 2*row); |
| timestampColVector.set(row, Timestamp.valueOf(timeStr)); |
| } |
| ((LongColumnVector) batch.cols[1]).vector[0] = |
| new DateWritable(new Date(year - 1900, 11, 25)).getDays(); |
| batch.cols[1].isRepeating = true; |
| writer.addRowBatch(batch); |
| } |
| |
| // add one more row to check the statistics for the jvm bug case |
| batch.size = 1; |
| String timeStr = String.format("%04d-12-12 12:34:56.0001", maxYear-1); |
| timestampColVector.set(0, Timestamp.valueOf(timeStr)); |
| writer.addRowBatch(batch); |
| writer.close(); |
| |
| // check the stats to make sure they match up to the millisecond |
| ColumnStatistics[] stats = writer.getStatistics(); |
| TimestampColumnStatistics tsStat = (TimestampColumnStatistics) stats[1]; |
| assertEquals(String.format("%04d-12-12 12:34:56.0", maxYear - 1), |
| tsStat.getMaximum().toString()); |
| assertEquals(String.format("%04d-05-05 12:34:56.0", minYear), |
| tsStat.getMinimum().toString()); |
| |
| // read back the rows |
| Reader reader = OrcFile.createReader(file, |
| OrcFile.readerOptions(conf)); |
| RecordReader rows = reader.rows(); |
| batch = reader.getSchema().createRowBatch(1000); |
| TimestampColumnVector times = (TimestampColumnVector) batch.cols[0]; |
| LongColumnVector dates = (LongColumnVector) batch.cols[1]; |
| for (int year = minYear; year < maxYear; ++year) { |
| rows.nextBatch(batch); |
| assertEquals(1000, batch.size); |
| for(int row = 0; row < 1000; ++row) { |
| Timestamp expected = Timestamp.valueOf( |
| String.format("%04d-05-05 12:34:56.%04d", year, 2*row)); |
| assertEquals("ms row " + row + " " + expected, expected.getTime(), |
| times.time[row]); |
| assertEquals("nanos row " + row + " " + expected, expected.getNanos(), |
| times.nanos[row]); |
| assertEquals("year " + year + " row " + row, |
| Integer.toString(year) + "-12-25", |
| new DateWritable((int) dates.vector[row]).toString()); |
| } |
| } |
| rows.nextBatch(batch); |
| assertEquals(1, batch.size); |
| } |
| |
| @Test |
| public void testDate1900() throws Exception { |
| createOrcDateFile(testFilePath, 1900, 1970); |
| } |
| |
| @Test |
| public void testDate2038() throws Exception { |
| createOrcDateFile(testFilePath, 2038, 2250); |
| } |
| |
| private static void setUnion(VectorizedRowBatch batch, int rowId, |
| Timestamp ts, Integer tag, Integer i, String s, |
| HiveDecimalWritable dec, Timestamp instant) { |
| UnionColumnVector union = (UnionColumnVector) batch.cols[1]; |
| if (ts != null) { |
| TimestampColumnVector timestampColVector = (TimestampColumnVector) batch.cols[0]; |
| timestampColVector.set(rowId, ts); |
| } else { |
| batch.cols[0].isNull[rowId] = true; |
| batch.cols[0].noNulls = false; |
| } |
| if (tag != null) { |
| union.tags[rowId] = tag; |
| if (tag == 0) { |
| if (i != null) { |
| ((LongColumnVector) union.fields[tag]).vector[rowId] = i; |
| } else { |
| union.fields[tag].isNull[rowId] = true; |
| union.fields[tag].noNulls = false; |
| } |
| } else if (tag == 1) { |
| if (s != null) { |
| ((BytesColumnVector) union.fields[tag]).setVal(rowId, s.getBytes(StandardCharsets.UTF_8)); |
| } else { |
| union.fields[tag].isNull[rowId] = true; |
| union.fields[tag].noNulls = false; |
| } |
| } else { |
| throw new IllegalArgumentException("Bad tag " + tag); |
| } |
| } else { |
| batch.cols[1].isNull[rowId] = true; |
| batch.cols[1].noNulls = false; |
| } |
| if (dec != null) { |
| ((DecimalColumnVector) batch.cols[2]).vector[rowId] = dec; |
| } else { |
| batch.cols[2].isNull[rowId] = true; |
| batch.cols[2].noNulls = false; |
| } |
| if (instant == null) { |
| batch.cols[3].isNull[rowId] = true; |
| batch.cols[3].noNulls = false; |
| } else { |
| ((TimestampColumnVector) batch.cols[3]).set(rowId, instant); |
| } |
| } |
| |
| /** |
| * Test writing with the new decimal and reading with the new and old. |
| */ |
| @Test |
| public void testDecimal64Writing() throws Exception { |
| TypeDescription schema = TypeDescription.fromString("struct<d:decimal(18,3)>"); |
| VectorizedRowBatch batch = schema.createRowBatchV2(); |
| Writer writer = OrcFile.createWriter(testFilePath, |
| OrcFile.writerOptions(conf) |
| .setSchema(schema) |
| .compress(CompressionKind.NONE) |
| .version(fileFormat)); |
| Decimal64ColumnVector cv = (Decimal64ColumnVector) batch.cols[0]; |
| cv.precision = 18; |
| cv.scale = 3; |
| cv.vector[0] = 1; |
| for(int r=1; r < 18; r++) { |
| cv.vector[r] = cv.vector[r-1] * 10; |
| } |
| cv.vector[18] = -2000; |
| batch.size = 19; |
| writer.addRowBatch(batch); |
| writer.close(); |
| |
| Reader reader = OrcFile.createReader(testFilePath, |
| OrcFile.readerOptions(conf).filesystem(fs)); |
| assertEquals("count: 19 hasNull: false", reader.getStatistics()[0].toString()); |
| // the size of the column in the different formats |
| int size = (fileFormat == OrcFile.Version.V_0_11 ? 89 : |
| fileFormat == OrcFile.Version.V_0_12 ? 90 : 154); |
| assertEquals("count: 19 hasNull: false bytesOnDisk: " + size + |
| " min: -2 max: 100000000000000 sum: 111111111111109.111", |
| reader.getStatistics()[1].toString()); |
| RecordReader rows = reader.rows(); |
| batch = schema.createRowBatchV2(); |
| cv = (Decimal64ColumnVector) batch.cols[0]; |
| assertTrue(rows.nextBatch(batch)); |
| assertEquals(19, batch.size); |
| assertEquals(18, cv.precision); |
| assertEquals(3, cv.scale); |
| assertEquals("row 0", 1, cv.vector[0]); |
| for(int r=1; r < 18; ++r) { |
| assertEquals("row " + r, 10 * cv.vector[r-1], cv.vector[r]); |
| } |
| assertEquals(-2000, cv.vector[18]); |
| assertFalse(rows.nextBatch(batch)); |
| |
| // test with old batch |
| rows = reader.rows(); |
| batch = schema.createRowBatch(); |
| DecimalColumnVector oldCv = (DecimalColumnVector) batch.cols[0]; |
| assertTrue(rows.nextBatch(batch)); |
| assertEquals(19, batch.size); |
| assertEquals(18, oldCv.precision); |
| assertEquals(3, oldCv.scale); |
| assertEquals("0.001", oldCv.vector[0].toString()); |
| assertEquals("0.01", oldCv.vector[1].toString()); |
| assertEquals("0.1", oldCv.vector[2].toString()); |
| assertEquals("1", oldCv.vector[3].toString()); |
| assertEquals("10", oldCv.vector[4].toString()); |
| assertEquals("100", oldCv.vector[5].toString()); |
| assertEquals("1000", oldCv.vector[6].toString()); |
| assertEquals("10000", oldCv.vector[7].toString()); |
| assertEquals("100000", oldCv.vector[8].toString()); |
| assertEquals("1000000", oldCv.vector[9].toString()); |
| assertEquals("10000000", oldCv.vector[10].toString()); |
| assertEquals("100000000", oldCv.vector[11].toString()); |
| assertEquals("1000000000", oldCv.vector[12].toString()); |
| assertEquals("10000000000", oldCv.vector[13].toString()); |
| assertEquals("100000000000", oldCv.vector[14].toString()); |
| assertEquals("1000000000000", oldCv.vector[15].toString()); |
| assertEquals("10000000000000", oldCv.vector[16].toString()); |
| assertEquals("100000000000000", oldCv.vector[17].toString()); |
| assertEquals("-2", oldCv.vector[18].toString()); |
| assertFalse(rows.nextBatch(batch)); |
| } |
| |
| /** |
| * Test writing with the old decimal and reading with the new and old. |
| */ |
| @Test |
| public void testDecimal64Reading() throws Exception { |
| TypeDescription schema = TypeDescription.fromString("struct<d:decimal(18,4)>"); |
| VectorizedRowBatch batch = schema.createRowBatch(); |
| Writer writer = OrcFile.createWriter(testFilePath, |
| OrcFile.writerOptions(conf) |
| .setSchema(schema) |
| .compress(CompressionKind.NONE) |
| .version(fileFormat)); |
| DecimalColumnVector cv = (DecimalColumnVector) batch.cols[0]; |
| cv.precision = 18; |
| cv.scale = 3; |
| long base = 1; |
| for(int r=0; r < 18; r++) { |
| cv.vector[r].setFromLongAndScale(base, 4); |
| base *= 10; |
| } |
| cv.vector[18].setFromLong(-2); |
| batch.size = 19; |
| writer.addRowBatch(batch); |
| writer.close(); |
| |
| // test with new batch |
| Reader reader = OrcFile.createReader(testFilePath, |
| OrcFile.readerOptions(conf).filesystem(fs)); |
| assertEquals("count: 19 hasNull: false", reader.getStatistics()[0].toString()); |
| // the size of the column in the different formats |
| int size = (fileFormat == OrcFile.Version.V_0_11 ? 63 : |
| fileFormat == OrcFile.Version.V_0_12 ? 65 : 154); |
| assertEquals("count: 19 hasNull: false bytesOnDisk: " + size + |
| " min: -2 max: 10000000000000 sum: 11111111111109.1111", |
| reader.getStatistics()[1].toString()); |
| |
| RecordReader rows = reader.rows(); |
| batch = schema.createRowBatchV2(); |
| Decimal64ColumnVector newCv = (Decimal64ColumnVector) batch.cols[0]; |
| assertTrue(rows.nextBatch(batch)); |
| assertEquals(19, batch.size); |
| assertEquals(18, newCv.precision); |
| assertEquals(4, newCv.scale); |
| assertEquals("row 0", 1, newCv.vector[0]); |
| for(int r=1; r < 18; ++r) { |
| assertEquals("row " + r, 10 * newCv.vector[r-1], newCv.vector[r]); |
| } |
| assertEquals(-20000, newCv.vector[18]); |
| assertFalse(rows.nextBatch(batch)); |
| |
| // test with old batch |
| rows = reader.rows(); |
| batch = schema.createRowBatch(); |
| cv = (DecimalColumnVector) batch.cols[0]; |
| assertTrue(rows.nextBatch(batch)); |
| assertEquals(19, batch.size); |
| assertEquals(18, cv.precision); |
| assertEquals(4, cv.scale); |
| assertEquals("0.0001", cv.vector[0].toString()); |
| assertEquals("0.001", cv.vector[1].toString()); |
| assertEquals("0.01", cv.vector[2].toString()); |
| assertEquals("0.1", cv.vector[3].toString()); |
| assertEquals("1", cv.vector[4].toString()); |
| assertEquals("10", cv.vector[5].toString()); |
| assertEquals("100", cv.vector[6].toString()); |
| assertEquals("1000", cv.vector[7].toString()); |
| assertEquals("10000", cv.vector[8].toString()); |
| assertEquals("100000", cv.vector[9].toString()); |
| assertEquals("1000000", cv.vector[10].toString()); |
| assertEquals("10000000", cv.vector[11].toString()); |
| assertEquals("100000000", cv.vector[12].toString()); |
| assertEquals("1000000000", cv.vector[13].toString()); |
| assertEquals("10000000000", cv.vector[14].toString()); |
| assertEquals("100000000000", cv.vector[15].toString()); |
| assertEquals("1000000000000", cv.vector[16].toString()); |
| assertEquals("10000000000000", cv.vector[17].toString()); |
| assertEquals("-2", cv.vector[18].toString()); |
| assertFalse(rows.nextBatch(batch)); |
| } |
| |
| /** |
| * We test union, timestamp, and decimal separately since we need to make the |
| * object inspector manually. (The Hive reflection-based doesn't handle |
| * them properly.) |
| */ |
| @Test |
| public void testUnionAndTimestamp() throws Exception { |
| final TimeZone original = TimeZone.getDefault(); |
| TimeZone.setDefault(TimeZone.getTimeZone("America/Los_Angeles")); |
| TypeDescription schema = TypeDescription.fromString( |
| "struct<time:timestamp," + |
| "union:uniontype<int,string>," + |
| "decimal:decimal(38,18)," + |
| "instant:timestamp with local time zone>" |
| ); |
| HiveDecimal maxValue = HiveDecimal.create("10000000000000000000"); |
| Writer writer = OrcFile.createWriter(testFilePath, |
| OrcFile.writerOptions(conf) |
| .setSchema(schema) |
| .stripeSize(1000) |
| .compress(CompressionKind.NONE) |
| .bufferSize(100) |
| .blockPadding(false) |
| .version(fileFormat)); |
| VectorizedRowBatch batch = schema.createRowBatch(); |
| batch.size = 6; |
| setUnion(batch, 0, Timestamp.valueOf("2000-03-12 15:00:00"), 0, 42, null, |
| new HiveDecimalWritable("12345678.6547456"), |
| Timestamp.valueOf("2014-12-12 6:00:00")); |
| setUnion(batch, 1, Timestamp.valueOf("2000-03-20 12:00:00.123456789"), |
| 1, null, "hello", new HiveDecimalWritable("-5643.234"), |
| Timestamp.valueOf("1996-12-11 11:00:00")); |
| |
| setUnion(batch, 2, null, null, null, null, null, null); |
| setUnion(batch, 3, null, 0, null, null, null, null); |
| setUnion(batch, 4, null, 1, null, null, null, null); |
| |
| setUnion(batch, 5, Timestamp.valueOf("1970-01-01 00:00:00"), 0, 200000, |
| null, new HiveDecimalWritable("10000000000000000000"), |
| Timestamp.valueOf("2011-07-01 09:00:00")); |
| writer.addRowBatch(batch); |
| |
| batch.reset(); |
| Random rand = new Random(42); |
| for(int i=1970; i < 2038; ++i) { |
| Timestamp ts = Timestamp.valueOf(i + "-05-05 12:34:56." + i); |
| HiveDecimal dec = |
| HiveDecimal.create(new BigInteger(64, rand), rand.nextInt(18)); |
| if ((i & 1) == 0) { |
| setUnion(batch, batch.size++, ts, 0, i*i, null, |
| new HiveDecimalWritable(dec), null); |
| } else { |
| setUnion(batch, batch.size++, ts, 1, null, Integer.toString(i*i), |
| new HiveDecimalWritable(dec), null); |
| } |
| if (maxValue.compareTo(dec) < 0) { |
| maxValue = dec; |
| } |
| } |
| writer.addRowBatch(batch); |
| batch.reset(); |
| |
| // let's add a lot of constant rows to test the rle |
| batch.size = 1000; |
| for(int c=0; c < batch.cols.length; ++c) { |
| batch.cols[c].setRepeating(true); |
| } |
| ((UnionColumnVector) batch.cols[1]).fields[0].isRepeating = true; |
| setUnion(batch, 0, null, 0, 1732050807, null, null, null); |
| for(int i=0; i < 5; ++i) { |
| writer.addRowBatch(batch); |
| } |
| |
| batch.reset(); |
| batch.size = 3; |
| setUnion(batch, 0, null, 0, 0, null, null, null); |
| setUnion(batch, 1, null, 0, 10, null, null, null); |
| setUnion(batch, 2, null, 0, 138, null, null, null); |
| writer.addRowBatch(batch); |
| // check the stats on the writer side |
| ColumnStatistics[] stats = writer.getStatistics(); |
| assertEquals("1996-12-11 11:00:00.0", |
| ((TimestampColumnStatistics) stats[6]).getMinimum().toString()); |
| assertEquals("1996-12-11 11:00:00.0", |
| ((TimestampColumnStatistics) stats[6]).getMinimumUTC().toString()); |
| assertEquals("2014-12-12 06:00:00.0", |
| ((TimestampColumnStatistics) stats[6]).getMaximum().toString()); |
| assertEquals("2014-12-12 06:00:00.0", |
| ((TimestampColumnStatistics) stats[6]).getMaximumUTC().toString()); |
| |
| writer.close(); |
| |
| TimeZone.setDefault(TimeZone.getTimeZone("America/New_York")); |
| Reader reader = OrcFile.createReader(testFilePath, |
| OrcFile.readerOptions(conf).filesystem(fs)); |
| stats = reader.getStatistics(); |
| |
| // check the timestamp statistics |
| assertEquals("1970-01-01 00:00:00.0", |
| ((TimestampColumnStatistics) stats[1]).getMinimum().toString()); |
| assertEquals("1969-12-31 19:00:00.0", |
| ((TimestampColumnStatistics) stats[1]).getMinimumUTC().toString()); |
| assertEquals("2037-05-05 12:34:56.203", |
| ((TimestampColumnStatistics) stats[1]).getMaximum().toString()); |
| assertEquals("2037-05-05 08:34:56.203", |
| ((TimestampColumnStatistics) stats[1]).getMaximumUTC().toString()); |
| |
| // check the instant statistics |
| assertEquals("1996-12-11 14:00:00.0", |
| ((TimestampColumnStatistics) stats[6]).getMinimum().toString()); |
| assertEquals("1996-12-11 14:00:00.0", |
| ((TimestampColumnStatistics) stats[6]).getMinimumUTC().toString()); |
| assertEquals("2014-12-12 09:00:00.0", |
| ((TimestampColumnStatistics) stats[6]).getMaximum().toString()); |
| assertEquals("2014-12-12 09:00:00.0", |
| ((TimestampColumnStatistics) stats[6]).getMaximumUTC().toString()); |
| |
| |
| schema = writer.getSchema(); |
| assertEquals(6, schema.getMaximumId()); |
| boolean[] expected = new boolean[] {false, false, false, false, false, false, false}; |
| boolean[] included = OrcUtils.includeColumns("", schema); |
| assertEquals(true, Arrays.equals(expected, included)); |
| |
| expected = new boolean[] {false, true, false, false, false, true, false}; |
| included = OrcUtils.includeColumns("time,decimal", schema); |
| assertEquals(true, Arrays.equals(expected, included)); |
| |
| expected = new boolean[] {false, false, true, true, true, false, false}; |
| included = OrcUtils.includeColumns("union", schema); |
| assertEquals(true, Arrays.equals(expected, included)); |
| |
| Assert.assertEquals(false, reader.getMetadataKeys().iterator().hasNext()); |
| Assert.assertEquals(5077, reader.getNumberOfRows()); |
| DecimalColumnStatistics decStats = |
| (DecimalColumnStatistics) reader.getStatistics()[5]; |
| assertEquals(71, decStats.getNumberOfValues()); |
| assertEquals(HiveDecimal.create("-5643.234"), decStats.getMinimum()); |
| assertEquals(maxValue, decStats.getMaximum()); |
| // TODO: fix this |
| // assertEquals(null,stats.getSum()); |
| int stripeCount = 0; |
| int rowCount = 0; |
| long currentOffset = -1; |
| for(StripeInformation stripe: reader.getStripes()) { |
| stripeCount += 1; |
| rowCount += stripe.getNumberOfRows(); |
| if (currentOffset < 0) { |
| currentOffset = stripe.getOffset() + stripe.getLength(); |
| } else { |
| assertEquals(currentOffset, stripe.getOffset()); |
| currentOffset += stripe.getLength(); |
| } |
| } |
| Assert.assertEquals(reader.getNumberOfRows(), rowCount); |
| assertEquals(2, stripeCount); |
| Assert.assertEquals(reader.getContentLength(), currentOffset); |
| RecordReader rows = reader.rows(); |
| Assert.assertEquals(0, rows.getRowNumber()); |
| Assert.assertEquals(0.0, rows.getProgress(), 0.000001); |
| |
| schema = reader.getSchema(); |
| batch = schema.createRowBatch(74); |
| Assert.assertEquals(0, rows.getRowNumber()); |
| rows.nextBatch(batch); |
| assertEquals(74, batch.size); |
| Assert.assertEquals(74, rows.getRowNumber()); |
| TimestampColumnVector ts = (TimestampColumnVector) batch.cols[0]; |
| UnionColumnVector union = (UnionColumnVector) batch.cols[1]; |
| LongColumnVector longs = (LongColumnVector) union.fields[0]; |
| BytesColumnVector strs = (BytesColumnVector) union.fields[1]; |
| DecimalColumnVector decs = (DecimalColumnVector) batch.cols[2]; |
| TimestampColumnVector instant = (TimestampColumnVector) batch.cols[3]; |
| |
| assertEquals("struct<time:timestamp,union:uniontype<int,string>,decimal:decimal(38,18)," + |
| "instant:timestamp with local time zone>", |
| schema.toString()); |
| assertEquals("2000-03-12 15:00:00.0", ts.asScratchTimestamp(0).toString()); |
| assertEquals(0, union.tags[0]); |
| assertEquals(42, longs.vector[0]); |
| assertEquals("12345678.6547456", decs.vector[0].toString()); |
| assertEquals("2014-12-12 09:00:00.0", instant.asScratchTimestamp(0).toString()); |
| |
| assertEquals("2000-03-20 12:00:00.123456789", ts.asScratchTimestamp(1).toString()); |
| assertEquals(1, union.tags[1]); |
| assertEquals("hello", strs.toString(1)); |
| assertEquals("-5643.234", decs.vector[1].toString()); |
| assertEquals("1996-12-11 14:00:00.0", instant.asScratchTimestamp(1).toString()); |
| |
| assertEquals(false, ts.noNulls); |
| assertEquals(false, union.noNulls); |
| assertEquals(false, decs.noNulls); |
| assertEquals(true, ts.isNull[2]); |
| assertEquals(true, union.isNull[2]); |
| assertEquals(true, decs.isNull[2]); |
| |
| assertEquals(true, ts.isNull[3]); |
| assertEquals(false, union.isNull[3]); |
| assertEquals(0, union.tags[3]); |
| assertEquals(true, longs.isNull[3]); |
| assertEquals(true, decs.isNull[3]); |
| |
| assertEquals(true, ts.isNull[4]); |
| assertEquals(false, union.isNull[4]); |
| assertEquals(1, union.tags[4]); |
| assertEquals(true, strs.isNull[4]); |
| assertEquals(true, decs.isNull[4]); |
| |
| assertEquals(false, ts.isNull[5]); |
| assertEquals("1970-01-01 00:00:00.0", ts.asScratchTimestamp(5).toString()); |
| assertEquals(false, union.isNull[5]); |
| assertEquals(0, union.tags[5]); |
| assertEquals(false, longs.isNull[5]); |
| assertEquals(200000, longs.vector[5]); |
| assertEquals(false, decs.isNull[5]); |
| assertEquals("10000000000000000000", decs.vector[5].toString()); |
| assertEquals("2011-07-01 12:00:00.0", instant.asScratchTimestamp(5).toString()); |
| |
| rand = new Random(42); |
| for(int i=1970; i < 2038; ++i) { |
| int row = 6 + i - 1970; |
| assertEquals(Timestamp.valueOf(i + "-05-05 12:34:56." + i), |
| ts.asScratchTimestamp(row)); |
| if ((i & 1) == 0) { |
| assertEquals(0, union.tags[row]); |
| assertEquals(i*i, longs.vector[row]); |
| } else { |
| assertEquals(1, union.tags[row]); |
| assertEquals(Integer.toString(i * i), strs.toString(row)); |
| } |
| assertEquals(new HiveDecimalWritable(HiveDecimal.create(new BigInteger(64, rand), |
| rand.nextInt(18))), decs.vector[row]); |
| } |
| |
| // rebuild the row batch, so that we can read by 1000 rows |
| batch = schema.createRowBatch(1000); |
| ts = (TimestampColumnVector) batch.cols[0]; |
| union = (UnionColumnVector) batch.cols[1]; |
| longs = (LongColumnVector) union.fields[0]; |
| strs = (BytesColumnVector) union.fields[1]; |
| decs = (DecimalColumnVector) batch.cols[2]; |
| |
| for(int i=0; i < 5; ++i) { |
| rows.nextBatch(batch); |
| assertEquals("batch " + i, 1000, batch.size); |
| assertEquals("batch " + i, false, union.isRepeating); |
| assertEquals("batch " + i, true, union.noNulls); |
| for(int r=0; r < batch.size; ++r) { |
| assertEquals("bad tag at " + i + "." +r, 0, union.tags[r]); |
| } |
| assertEquals("batch " + i, true, longs.isRepeating); |
| assertEquals("batch " + i, 1732050807, longs.vector[0]); |
| } |
| |
| rows.nextBatch(batch); |
| assertEquals(3, batch.size); |
| assertEquals(0, union.tags[0]); |
| assertEquals(0, longs.vector[0]); |
| assertEquals(0, union.tags[1]); |
| assertEquals(10, longs.vector[1]); |
| assertEquals(0, union.tags[2]); |
| assertEquals(138, longs.vector[2]); |
| |
| rows.nextBatch(batch); |
| assertEquals(0, batch.size); |
| Assert.assertEquals(1.0, rows.getProgress(), 0.00001); |
| Assert.assertEquals(reader.getNumberOfRows(), rows.getRowNumber()); |
| rows.seekToRow(1); |
| rows.nextBatch(batch); |
| assertEquals(1000, batch.size); |
| assertEquals(Timestamp.valueOf("2000-03-20 12:00:00.123456789"), ts.asScratchTimestamp(0)); |
| assertEquals(1, union.tags[0]); |
| assertEquals("hello", strs.toString(0)); |
| assertEquals(new HiveDecimalWritable(HiveDecimal.create("-5643.234")), decs.vector[0]); |
| rows.close(); |
| TimeZone.setDefault(original); |
| } |
| |
| /** |
| * Read and write a randomly generated snappy file. |
| * @throws Exception |
| */ |
| @Test |
| public void testSnappy() throws Exception { |
| TypeDescription schema = createInnerSchema(); |
| Writer writer = OrcFile.createWriter(testFilePath, |
| OrcFile.writerOptions(conf) |
| .setSchema(schema) |
| .stripeSize(1000) |
| .compress(CompressionKind.SNAPPY) |
| .bufferSize(100) |
| .version(fileFormat)); |
| VectorizedRowBatch batch = schema.createRowBatch(); |
| Random rand; |
| writeRandomIntBytesBatches(writer, batch, 10, 1000); |
| writer.close(); |
| Reader reader = OrcFile.createReader(testFilePath, |
| OrcFile.readerOptions(conf).filesystem(fs)); |
| Assert.assertEquals(CompressionKind.SNAPPY, reader.getCompressionKind()); |
| RecordReader rows = reader.rows(); |
| batch = reader.getSchema().createRowBatch(1000); |
| rand = new Random(12); |
| LongColumnVector longs = (LongColumnVector) batch.cols[0]; |
| BytesColumnVector strs = (BytesColumnVector) batch.cols[1]; |
| for(int b=0; b < 10; ++b) { |
| rows.nextBatch(batch); |
| assertEquals(1000, batch.size); |
| for(int r=0; r < batch.size; ++r) { |
| assertEquals(rand.nextInt(), longs.vector[r]); |
| assertEquals(Integer.toHexString(rand.nextInt()), strs.toString(r)); |
| } |
| } |
| rows.nextBatch(batch); |
| assertEquals(0, batch.size); |
| rows.close(); |
| } |
| |
| /** |
| * Read and write a randomly generated lzo file. |
| * @throws Exception |
| */ |
| @Test |
| public void testLzo() throws Exception { |
| TypeDescription schema = |
| TypeDescription.fromString("struct<x:bigint,y:int,z:bigint>"); |
| Writer writer = OrcFile.createWriter(testFilePath, |
| OrcFile.writerOptions(conf) |
| .setSchema(schema) |
| .stripeSize(10000) |
| .compress(CompressionKind.LZO) |
| .bufferSize(1000) |
| .version(fileFormat)); |
| VectorizedRowBatch batch = schema.createRowBatch(); |
| Random rand = new Random(69); |
| batch.size = 1000; |
| for(int b=0; b < 10; ++b) { |
| for (int r=0; r < 1000; ++r) { |
| ((LongColumnVector) batch.cols[0]).vector[r] = rand.nextInt(); |
| ((LongColumnVector) batch.cols[1]).vector[r] = b * 1000 + r; |
| ((LongColumnVector) batch.cols[2]).vector[r] = rand.nextLong(); |
| } |
| writer.addRowBatch(batch); |
| } |
| writer.close(); |
| Reader reader = OrcFile.createReader(testFilePath, |
| OrcFile.readerOptions(conf).filesystem(fs)); |
| assertEquals(CompressionKind.LZO, reader.getCompressionKind()); |
| RecordReader rows = reader.rows(); |
| batch = reader.getSchema().createRowBatch(1000); |
| rand = new Random(69); |
| for(int b=0; b < 10; ++b) { |
| rows.nextBatch(batch); |
| assertEquals(1000, batch.size); |
| for(int r=0; r < batch.size; ++r) { |
| assertEquals(rand.nextInt(), |
| ((LongColumnVector) batch.cols[0]).vector[r]); |
| assertEquals(b * 1000 + r, |
| ((LongColumnVector) batch.cols[1]).vector[r]); |
| assertEquals(rand.nextLong(), |
| ((LongColumnVector) batch.cols[2]).vector[r]); |
| } |
| } |
| rows.nextBatch(batch); |
| assertEquals(0, batch.size); |
| rows.close(); |
| } |
| |
| /** |
| * Read and write a randomly generated lz4 file. |
| * @throws Exception |
| */ |
| @Test |
| public void testLz4() throws Exception { |
| TypeDescription schema = |
| TypeDescription.fromString("struct<x:bigint,y:int,z:bigint>"); |
| Writer writer = OrcFile.createWriter(testFilePath, |
| OrcFile.writerOptions(conf) |
| .setSchema(schema) |
| .stripeSize(10000) |
| .compress(CompressionKind.LZ4) |
| .bufferSize(1000) |
| .version(fileFormat)); |
| VectorizedRowBatch batch = schema.createRowBatch(); |
| Random rand = new Random(3); |
| batch.size = 1000; |
| for(int b=0; b < 10; ++b) { |
| for (int r=0; r < 1000; ++r) { |
| ((LongColumnVector) batch.cols[0]).vector[r] = rand.nextInt(); |
| ((LongColumnVector) batch.cols[1]).vector[r] = b * 1000 + r; |
| ((LongColumnVector) batch.cols[2]).vector[r] = rand.nextLong(); |
| } |
| writer.addRowBatch(batch); |
| } |
| writer.close(); |
| Reader reader = OrcFile.createReader(testFilePath, |
| OrcFile.readerOptions(conf).filesystem(fs)); |
| assertEquals(CompressionKind.LZ4, reader.getCompressionKind()); |
| RecordReader rows = reader.rows(); |
| batch = reader.getSchema().createRowBatch(1000); |
| rand = new Random(3); |
| for(int b=0; b < 10; ++b) { |
| rows.nextBatch(batch); |
| assertEquals(1000, batch.size); |
| for(int r=0; r < batch.size; ++r) { |
| assertEquals(rand.nextInt(), |
| ((LongColumnVector) batch.cols[0]).vector[r]); |
| assertEquals(b * 1000 + r, |
| ((LongColumnVector) batch.cols[1]).vector[r]); |
| assertEquals(rand.nextLong(), |
| ((LongColumnVector) batch.cols[2]).vector[r]); |
| } |
| } |
| rows.nextBatch(batch); |
| assertEquals(0, batch.size); |
| rows.close(); |
| } |
| |
| /** |
| * Read and write a randomly generated zstd file. |
| */ |
| @Test |
| public void testZstd() throws Exception { |
| TypeDescription schema = |
| TypeDescription.fromString("struct<x:bigint,y:int,z:bigint>"); |
| try (Writer writer = OrcFile.createWriter(testFilePath, |
| OrcFile.writerOptions(conf) |
| .setSchema(schema) |
| .compress(CompressionKind.ZSTD) |
| .bufferSize(1000) |
| .version(fileFormat))) { |
| VectorizedRowBatch batch = schema.createRowBatch(); |
| Random rand = new Random(3); |
| batch.size = 1000; |
| for (int b = 0; b < 10; ++b) { |
| for (int r = 0; r < 1000; ++r) { |
| ((LongColumnVector) batch.cols[0]).vector[r] = rand.nextInt(); |
| ((LongColumnVector) batch.cols[1]).vector[r] = b * 1000 + r; |
| ((LongColumnVector) batch.cols[2]).vector[r] = rand.nextLong(); |
| } |
| writer.addRowBatch(batch); |
| } |
| } |
| try (Reader reader = OrcFile.createReader(testFilePath, |
| OrcFile.readerOptions(conf).filesystem(fs)); |
| RecordReader rows = reader.rows()) { |
| assertEquals(CompressionKind.ZSTD, reader.getCompressionKind()); |
| VectorizedRowBatch batch = reader.getSchema().createRowBatch(1000); |
| Random rand = new Random(3); |
| for (int b = 0; b < 10; ++b) { |
| rows.nextBatch(batch); |
| assertEquals(1000, batch.size); |
| for (int r = 0; r < batch.size; ++r) { |
| assertEquals(rand.nextInt(), |
| ((LongColumnVector) batch.cols[0]).vector[r]); |
| assertEquals(b * 1000 + r, |
| ((LongColumnVector) batch.cols[1]).vector[r]); |
| assertEquals(rand.nextLong(), |
| ((LongColumnVector) batch.cols[2]).vector[r]); |
| } |
| } |
| rows.nextBatch(batch); |
| assertEquals(0, batch.size); |
| } |
| } |
| |
| /** |
| * Read and write a file; verify codec usage. |
| * @throws Exception |
| */ |
| @Test |
| public void testCodecPool() throws Exception { |
| OrcCodecPool.clear(); |
| TypeDescription schema = createInnerSchema(); |
| VectorizedRowBatch batch = schema.createRowBatch(); |
| WriterOptions opts = OrcFile.writerOptions(conf) |
| .setSchema(schema).stripeSize(1000).bufferSize(100).version(fileFormat); |
| |
| CompressionCodec snappyCodec, zlibCodec; |
| snappyCodec = writeBatchesAndGetCodec(10, 1000, opts.compress(CompressionKind.SNAPPY), batch); |
| assertEquals(1, OrcCodecPool.getPoolSize(CompressionKind.SNAPPY)); |
| Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs)); |
| Assert.assertEquals(CompressionKind.SNAPPY, reader.getCompressionKind()); |
| CompressionCodec codec = readBatchesAndGetCodec(reader, 10, 1000); |
| assertEquals(1, OrcCodecPool.getPoolSize(CompressionKind.SNAPPY)); |
| assertSame(snappyCodec, codec); |
| |
| reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs)); |
| Assert.assertEquals(CompressionKind.SNAPPY, reader.getCompressionKind()); |
| codec = readBatchesAndGetCodec(reader, 10, 1000); |
| assertSame(snappyCodec, codec); |
| assertEquals(1, OrcCodecPool.getPoolSize(CompressionKind.SNAPPY)); |
| |
| zlibCodec = writeBatchesAndGetCodec(10, 1000, opts.compress(CompressionKind.ZLIB), batch); |
| assertNotSame(snappyCodec, zlibCodec); |
| assertEquals(1, OrcCodecPool.getPoolSize(CompressionKind.ZLIB)); |
| codec = writeBatchesAndGetCodec(10, 1000, opts.compress(CompressionKind.ZLIB), batch); |
| assertEquals(1, OrcCodecPool.getPoolSize(CompressionKind.ZLIB)); |
| assertSame(zlibCodec, codec); |
| |
| assertSame(snappyCodec, OrcCodecPool.getCodec(CompressionKind.SNAPPY)); |
| CompressionCodec snappyCodec2 = writeBatchesAndGetCodec( |
| 10, 1000, opts.compress(CompressionKind.SNAPPY), batch); |
| assertNotSame(snappyCodec, snappyCodec2); |
| OrcCodecPool.returnCodec(CompressionKind.SNAPPY, snappyCodec); |
| reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs)); |
| Assert.assertEquals(CompressionKind.SNAPPY, reader.getCompressionKind()); |
| codec = readBatchesAndGetCodec(reader, 10, 1000); |
| assertEquals(2, OrcCodecPool.getPoolSize(CompressionKind.SNAPPY)); |
| assertTrue(snappyCodec == codec || snappyCodec2 == codec); |
| } |
| |
| private CompressionCodec writeBatchesAndGetCodec(int count, |
| int size, |
| WriterOptions opts, |
| VectorizedRowBatch batch |
| ) throws IOException { |
| fs.delete(testFilePath, false); |
| Writer writer = OrcFile.createWriter(testFilePath, opts); |
| CompressionCodec codec = ((WriterImpl) writer).getCompressionCodec(); |
| writeRandomIntBytesBatches(writer, batch, count, size); |
| writer.close(); |
| return codec; |
| } |
| |
| private CompressionCodec readBatchesAndGetCodec( |
| Reader reader, int count, int size) throws IOException { |
| RecordReader rows = reader.rows(); |
| VectorizedRowBatch batch = reader.getSchema().createRowBatch(size); |
| for (int b = 0; b < count; ++b) { |
| rows.nextBatch(batch); |
| } |
| CompressionCodec codec = ((RecordReaderImpl)rows).getCompressionCodec(); |
| rows.close(); |
| return codec; |
| } |
| |
| private void readRandomBatches( |
| Reader reader, RecordReader rows, int count, int size) throws IOException { |
| |
| } |
| |
| private void writeRandomIntBytesBatches( |
| Writer writer, VectorizedRowBatch batch, int count, int size) throws IOException { |
| Random rand = new Random(12); |
| batch.size = size; |
| for(int b=0; b < count; ++b) { |
| for (int r=0; r < size; ++r) { |
| ((LongColumnVector) batch.cols[0]).vector[r] = rand.nextInt(); |
| ((BytesColumnVector) batch.cols[1]).setVal(r, |
| Integer.toHexString(rand.nextInt()).getBytes(StandardCharsets.UTF_8)); |
| } |
| writer.addRowBatch(batch); |
| } |
| } |
| |
| /** |
| * Read and write a randomly generated snappy file. |
| * @throws Exception |
| */ |
| @Test |
| public void testWithoutIndex() throws Exception { |
| TypeDescription schema = createInnerSchema(); |
| Writer writer = OrcFile.createWriter(testFilePath, |
| OrcFile.writerOptions(conf) |
| .setSchema(schema) |
| .stripeSize(5000) |
| .compress(CompressionKind.SNAPPY) |
| .bufferSize(1000) |
| .rowIndexStride(0) |
| .version(fileFormat)); |
| VectorizedRowBatch batch = schema.createRowBatch(); |
| Random rand = new Random(24); |
| batch.size = 5; |
| for(int c=0; c < batch.cols.length; ++c) { |
| batch.cols[c].setRepeating(true); |
| } |
| for(int i=0; i < 10000; ++i) { |
| ((LongColumnVector) batch.cols[0]).vector[0] = rand.nextInt(); |
| ((BytesColumnVector) batch.cols[1]) |
| .setVal(0, Integer.toBinaryString(rand.nextInt()).getBytes(StandardCharsets.UTF_8)); |
| writer.addRowBatch(batch); |
| } |
| writer.close(); |
| Reader reader = OrcFile.createReader(testFilePath, |
| OrcFile.readerOptions(conf).filesystem(fs)); |
| Assert.assertEquals(50000, reader.getNumberOfRows()); |
| Assert.assertEquals(0, reader.getRowIndexStride()); |
| StripeInformation stripe = reader.getStripes().iterator().next(); |
| assertEquals(true, stripe.getDataLength() != 0); |
| assertEquals(0, stripe.getIndexLength()); |
| RecordReader rows = reader.rows(); |
| rand = new Random(24); |
| batch = reader.getSchema().createRowBatch(1000); |
| LongColumnVector longs = (LongColumnVector) batch.cols[0]; |
| BytesColumnVector strs = (BytesColumnVector) batch.cols[1]; |
| for(int i=0; i < 50; ++i) { |
| rows.nextBatch(batch); |
| assertEquals("batch " + i, 1000, batch.size); |
| for(int j=0; j < 200; ++j) { |
| int intVal = rand.nextInt(); |
| String strVal = Integer.toBinaryString(rand.nextInt()); |
| for (int k = 0; k < 5; ++k) { |
| assertEquals(intVal, longs.vector[j * 5 + k]); |
| assertEquals(strVal, strs.toString(j * 5 + k)); |
| } |
| } |
| } |
| rows.nextBatch(batch); |
| assertEquals(0, batch.size); |
| rows.close(); |
| } |
| |
| @Test |
| public void testSeek() throws Exception { |
| TypeDescription schema = createBigRowSchema(); |
| Writer writer = OrcFile.createWriter(testFilePath, |
| OrcFile.writerOptions(conf) |
| .setSchema(schema) |
| .stripeSize(200000) |
| .bufferSize(65536) |
| .rowIndexStride(1000) |
| .version(fileFormat)); |
| VectorizedRowBatch batch = schema.createRowBatch(); |
| Random rand = new Random(42); |
| final int COUNT=32768; |
| long[] intValues= new long[COUNT]; |
| double[] doubleValues = new double[COUNT]; |
| String[] stringValues = new String[COUNT]; |
| BytesWritable[] byteValues = new BytesWritable[COUNT]; |
| String[] words = new String[128]; |
| for(int i=0; i < words.length; ++i) { |
| words[i] = Integer.toHexString(rand.nextInt()); |
| } |
| for(int i=0; i < COUNT/2; ++i) { |
| intValues[2*i] = rand.nextLong(); |
| intValues[2*i+1] = intValues[2*i]; |
| stringValues[2*i] = words[rand.nextInt(words.length)]; |
| stringValues[2*i+1] = stringValues[2*i]; |
| } |
| for(int i=0; i < COUNT; ++i) { |
| doubleValues[i] = rand.nextDouble(); |
| byte[] buf = new byte[20]; |
| rand.nextBytes(buf); |
| byteValues[i] = new BytesWritable(buf); |
| } |
| for(int i=0; i < COUNT; ++i) { |
| appendRandomRow(batch, intValues, doubleValues, stringValues, |
| byteValues, words, i); |
| if (batch.size == 1024) { |
| writer.addRowBatch(batch); |
| batch.reset(); |
| } |
| } |
| if (batch.size != 0) { |
| writer.addRowBatch(batch); |
| } |
| writer.close(); |
| Reader reader = OrcFile.createReader(testFilePath, |
| OrcFile.readerOptions(conf).filesystem(fs)); |
| Assert.assertEquals(COUNT, reader.getNumberOfRows()); |
| RecordReader rows = reader.rows(); |
| // get the row index |
| InStream.StreamOptions options = InStream.options(); |
| if (reader.getCompressionKind() != CompressionKind.NONE) { |
| options.withCodec(OrcCodecPool.getCodec(reader.getCompressionKind())) |
| .withBufferSize(reader.getCompressionSize()); |
| } |
| DataReader meta = RecordReaderUtils.createDefaultDataReader( |
| DataReaderProperties.builder() |
| .withFileSystem(fs) |
| .withPath(testFilePath) |
| .withCompression(options) |
| .withZeroCopy(false) |
| .build()); |
| StripePlanner planner = new StripePlanner(schema, new ReaderEncryption(), |
| meta, reader.getWriterVersion(), true, Integer.MAX_VALUE); |
| boolean[] columns = new boolean[schema.getMaximumId() + 1]; |
| Arrays.fill(columns, true); |
| OrcIndex index = planner.parseStripe(reader.getStripes().get(0), columns) |
| .readRowIndex(null, null); |
| // check the primitive columns to make sure they have the right number of |
| // items in the first row group |
| for(int c=1; c < 9; ++c) { |
| OrcProto.RowIndex colIndex = index.getRowGroupIndex()[c]; |
| assertEquals(1000, |
| colIndex.getEntry(0).getStatistics().getNumberOfValues()); |
| } |
| batch = reader.getSchema().createRowBatch(); |
| int nextRowInBatch = -1; |
| for(int i=COUNT-1; i >= 0; --i, --nextRowInBatch) { |
| // if we have consumed the previous batch read a new one |
| if (nextRowInBatch < 0) { |
| long base = Math.max(i - 1023, 0); |
| rows.seekToRow(base); |
| Assert.assertEquals("row " + i, true, rows.nextBatch(batch)); |
| nextRowInBatch = batch.size - 1; |
| } |
| checkRandomRow(batch, intValues, doubleValues, |
| stringValues, byteValues, words, i, nextRowInBatch); |
| } |
| rows.close(); |
| Iterator<StripeInformation> stripeIterator = |
| reader.getStripes().iterator(); |
| long offsetOfStripe2 = 0; |
| long offsetOfStripe4 = 0; |
| long lastRowOfStripe2 = 0; |
| for(int i = 0; i < 5; ++i) { |
| StripeInformation stripe = stripeIterator.next(); |
| if (i < 2) { |
| lastRowOfStripe2 += stripe.getNumberOfRows(); |
| } else if (i == 2) { |
| offsetOfStripe2 = stripe.getOffset(); |
| lastRowOfStripe2 += stripe.getNumberOfRows() - 1; |
| } else if (i == 4) { |
| offsetOfStripe4 = stripe.getOffset(); |
| } |
| } |
| Arrays.fill(columns, false); |
| columns[5] = true; // long colulmn |
| columns[9] = true; // text column |
| rows = reader.rows(reader.options() |
| .range(offsetOfStripe2, offsetOfStripe4 - offsetOfStripe2) |
| .include(columns)); |
| rows.seekToRow(lastRowOfStripe2); |
| // we only want two rows |
| batch = reader.getSchema().createRowBatch(2); |
| Assert.assertEquals(true, rows.nextBatch(batch)); |
| assertEquals(1, batch.size); |
| assertEquals(intValues[(int) lastRowOfStripe2], getLong(batch, 0)); |
| assertEquals(stringValues[(int) lastRowOfStripe2], |
| getText(batch, 0).toString()); |
| Assert.assertEquals(true, rows.nextBatch(batch)); |
| assertEquals(intValues[(int) lastRowOfStripe2 + 1], getLong(batch, 0)); |
| assertEquals(stringValues[(int) lastRowOfStripe2 + 1], |
| getText(batch, 0).toString()); |
| rows.close(); |
| } |
| |
| private void appendRandomRow(VectorizedRowBatch batch, |
| long[] intValues, double[] doubleValues, |
| String[] stringValues, |
| BytesWritable[] byteValues, |
| String[] words, int i) { |
| InnerStruct inner = new InnerStruct((int) intValues[i], stringValues[i]); |
| InnerStruct inner2 = new InnerStruct((int) (intValues[i] >> 32), |
| words[i % words.length] + "-x"); |
| setBigRow(batch, batch.size++, (intValues[i] & 1) == 0, (byte) intValues[i], |
| (short) intValues[i], (int) intValues[i], intValues[i], |
| (float) doubleValues[i], doubleValues[i], byteValues[i], stringValues[i], |
| new MiddleStruct(inner, inner2), list(), map(inner, inner2)); |
| } |
| |
| private void checkRandomRow(VectorizedRowBatch batch, |
| long[] intValues, double[] doubleValues, |
| String[] stringValues, |
| BytesWritable[] byteValues, |
| String[] words, int i, int rowInBatch) { |
| InnerStruct inner = new InnerStruct((int) intValues[i], stringValues[i]); |
| InnerStruct inner2 = new InnerStruct((int) (intValues[i] >> 32), |
| words[i % words.length] + "-x"); |
| checkBigRow(batch, rowInBatch, i, (intValues[i] & 1) == 0, (byte) intValues[i], |
| (short) intValues[i], (int) intValues[i], intValues[i], |
| (float) doubleValues[i], doubleValues[i], byteValues[i], stringValues[i], |
| new MiddleStruct(inner, inner2), list(), map(inner, inner2)); |
| } |
| |
| @Test |
| public void testMemoryManagement() throws Exception { |
| OrcConf.ROWS_BETWEEN_CHECKS.setLong(conf, 100); |
| final long POOL_SIZE = 50_000; |
| TypeDescription schema = createInnerSchema(); |
| MemoryManagerImpl memoryMgr = new MemoryManagerImpl(POOL_SIZE); |
| |
| // set up 10 files that all request the full size. |
| MemoryManager.Callback ignore = newScale -> false; |
| for(int f=0; f < 9; ++f) { |
| memoryMgr.addWriter(new Path("file-" + f), POOL_SIZE, ignore); |
| } |
| |
| Writer writer = OrcFile.createWriter(testFilePath, |
| OrcFile.writerOptions(conf) |
| .setSchema(schema) |
| .compress(CompressionKind.NONE) |
| .stripeSize(POOL_SIZE) |
| .bufferSize(100) |
| .rowIndexStride(0) |
| .memory(memoryMgr) |
| .version(fileFormat)); |
| // check to make sure it is 10% |
| assertEquals(0.1, memoryMgr.getAllocationScale(), 0.001); |
| VectorizedRowBatch batch = schema.createRowBatch(); |
| batch.size = 1; |
| for(int i=0; i < 2500; ++i) { |
| ((LongColumnVector) batch.cols[0]).vector[0] = i * 300; |
| ((BytesColumnVector) batch.cols[1]).setVal(0, |
| Integer.toHexString(10*i).getBytes(StandardCharsets.UTF_8)); |
| writer.addRowBatch(batch); |
| } |
| writer.close(); |
| assertEquals(0.111, memoryMgr.getAllocationScale(), 0.001); |
| Reader reader = OrcFile.createReader(testFilePath, |
| OrcFile.readerOptions(conf).filesystem(fs)); |
| int i = 0; |
| for(StripeInformation stripe: reader.getStripes()) { |
| i += 1; |
| assertTrue("stripe " + i + " is too long at " + stripe.getDataLength(), |
| stripe.getDataLength() < POOL_SIZE); |
| } |
| // 0.11 always uses the dictionary, so ends up with a lot more stripes |
| assertEquals(fileFormat == OrcFile.Version.V_0_11 ? 25 : 3, i); |
| assertEquals(2500, reader.getNumberOfRows()); |
| } |
| |
| /** |
| * Test that search arguments limit the rows that are read. The file holds |
| * 3500 rows written with a row index stride of 1000, where int1 is 300 |
| * times the row number, and it is read with three different predicates. |
| * Each record reader is closed once its checks are done. |
| * @throws Exception |
| */ |
| @Test |
| public void testPredicatePushdown() throws Exception { |
| TypeDescription schema = createInnerSchema(); |
| Writer writer = OrcFile.createWriter(testFilePath, |
| OrcFile.writerOptions(conf) |
| .setSchema(schema) |
| .stripeSize(400000L) |
| .compress(CompressionKind.NONE) |
| .bufferSize(500) |
| .rowIndexStride(1000) |
| .version(fileFormat)); |
| VectorizedRowBatch batch = schema.createRowBatch(); |
| batch.ensureSize(3500); |
| batch.size = 3500; |
| for(int i=0; i < 3500; ++i) { |
| ((LongColumnVector) batch.cols[0]).vector[i] = i * 300; |
| ((BytesColumnVector) batch.cols[1]).setVal(i, |
| Integer.toHexString(10*i).getBytes(StandardCharsets.UTF_8)); |
| } |
| writer.addRowBatch(batch); |
| writer.close(); |
| Reader reader = OrcFile.createReader(testFilePath, |
| OrcFile.readerOptions(conf).filesystem(fs)); |
| assertEquals(3500, reader.getNumberOfRows()); |
| |
| // select 300000 <= int1 < 600000, which is rows 1000 to 1999 |
| SearchArgument sarg = SearchArgumentFactory.newBuilder() |
| .startAnd() |
| .startNot() |
| .lessThan("int1", PredicateLeaf.Type.LONG, 300000L) |
| .end() |
| .lessThan("int1", PredicateLeaf.Type.LONG, 600000L) |
| .end() |
| .build(); |
| RecordReader rows = reader.rows(reader.options() |
| .range(0L, Long.MAX_VALUE) |
| .include(new boolean[]{true, true, true}) |
| .searchArgument(sarg, new String[]{null, "int1", "string1"})); |
| batch = reader.getSchema().createRowBatch(2000); |
| LongColumnVector ints = (LongColumnVector) batch.cols[0]; |
| BytesColumnVector strs = (BytesColumnVector) batch.cols[1]; |
| |
| Assert.assertEquals(1000L, rows.getRowNumber()); |
| Assert.assertEquals(true, rows.nextBatch(batch)); |
| assertEquals(1000, batch.size); |
| |
| for(int i=1000; i < 2000; ++i) { |
| assertEquals(300 * i, ints.vector[i - 1000]); |
| assertEquals(Integer.toHexString(10*i), strs.toString(i - 1000)); |
| } |
| Assert.assertEquals(false, rows.nextBatch(batch)); |
| Assert.assertEquals(3500, rows.getRowNumber()); |
| // release the reader before the next one is opened on the same file |
| rows.close(); |
| |
| // look through the file with no rows selected |
| sarg = SearchArgumentFactory.newBuilder() |
| .startAnd() |
| .lessThan("int1", PredicateLeaf.Type.LONG, 0L) |
| .end() |
| .build(); |
| rows = reader.rows(reader.options() |
| .range(0L, Long.MAX_VALUE) |
| .include(new boolean[]{true, true, true}) |
| .searchArgument(sarg, new String[]{null, "int1", "string1"})); |
| Assert.assertEquals(3500L, rows.getRowNumber()); |
| Assert.assertEquals(false, rows.nextBatch(batch)); |
| rows.close(); |
| |
| // select first 100 and last 100 rows |
| sarg = SearchArgumentFactory.newBuilder() |
| .startOr() |
| .lessThan("int1", PredicateLeaf.Type.LONG, 300L * 100) |
| .startNot() |
| .lessThan("int1", PredicateLeaf.Type.LONG, 300L * 3400) |
| .end() |
| .end() |
| .build(); |
| rows = reader.rows(reader.options() |
| .range(0L, Long.MAX_VALUE) |
| .include(new boolean[]{true, true, true}) |
| .searchArgument(sarg, new String[]{null, "int1", "string1"})); |
| Assert.assertEquals(0, rows.getRowNumber()); |
| // the whole first row group comes back, not just the 100 matching rows |
| Assert.assertEquals(true, rows.nextBatch(batch)); |
| assertEquals(1000, batch.size); |
| Assert.assertEquals(3000, rows.getRowNumber()); |
| for(int i=0; i < 1000; ++i) { |
| assertEquals(300 * i, ints.vector[i]); |
| assertEquals(Integer.toHexString(10*i), strs.toString(i)); |
| } |
| |
| // and the whole last row group, which only has 500 rows |
| Assert.assertEquals(true, rows.nextBatch(batch)); |
| assertEquals(500, batch.size); |
| Assert.assertEquals(3500, rows.getRowNumber()); |
| for(int i=3000; i < 3500; ++i) { |
| assertEquals(300 * i, ints.vector[i - 3000]); |
| assertEquals(Integer.toHexString(10*i), strs.toString(i - 3000)); |
| } |
| Assert.assertEquals(false, rows.nextBatch(batch)); |
| Assert.assertEquals(3500, rows.getRowNumber()); |
| rows.close(); |
| } |
| |
| /** |
| * Test all of the types that have distinct ORC writers using the vectorized |
| * writer with different combinations of repeating and null values. |
| * <p> |
| * Four batches of 1024 rows are written: repeating nulls, repeating |
| * non-null values, non-repeating nulls, and distinct non-null values. The |
| * file statistics and the values that are read back are then verified. |
| * @throws Exception |
| */ |
| @Test |
| public void testRepeating() throws Exception { |
| // create a row type with each type that has a unique writer |
| // really just folds short, int, and long together |
| TypeDescription schema = TypeDescription.createStruct() |
| .addField("bin", TypeDescription.createBinary()) |
| .addField("bool", TypeDescription.createBoolean()) |
| .addField("byte", TypeDescription.createByte()) |
| .addField("long", TypeDescription.createLong()) |
| .addField("float", TypeDescription.createFloat()) |
| .addField("double", TypeDescription.createDouble()) |
| .addField("date", TypeDescription.createDate()) |
| .addField("time", TypeDescription.createTimestamp()) |
| .addField("dec", TypeDescription.createDecimal() |
| .withPrecision(20).withScale(6)) |
| .addField("string", TypeDescription.createString()) |
| .addField("char", TypeDescription.createChar().withMaxLength(10)) |
| .addField("vc", TypeDescription.createVarchar().withMaxLength(10)) |
| .addField("struct", TypeDescription.createStruct() |
| .addField("sub1", TypeDescription.createInt())) |
| .addField("union", TypeDescription.createUnion() |
| .addUnionChild(TypeDescription.createString()) |
| .addUnionChild(TypeDescription.createInt())) |
| .addField("list", TypeDescription |
| .createList(TypeDescription.createInt())) |
| .addField("map", |
| TypeDescription.createMap(TypeDescription.createString(), |
| TypeDescription.createString())); |
| VectorizedRowBatch batch = schema.createRowBatch(); |
| Writer writer = OrcFile.createWriter(testFilePath, |
| OrcFile.writerOptions(conf) |
| .setSchema(schema) |
| .rowIndexStride(1000) |
| .version(fileFormat)); |
| |
| // write 1024 repeating nulls |
| batch.size = 1024; |
| for(int c = 0; c < batch.cols.length; ++c) { |
| batch.cols[c].setRepeating(true); |
| batch.cols[c].noNulls = false; |
| batch.cols[c].isNull[0] = true; |
| } |
| writer.addRowBatch(batch); |
| |
| // write 1024 repeating non-null |
| for(int c =0; c < batch.cols.length; ++c) { |
| batch.cols[c].isNull[0] = false; |
| } |
| ((BytesColumnVector) batch.cols[0]).setVal(0, "Horton".getBytes(StandardCharsets.UTF_8)); |
| ((LongColumnVector) batch.cols[1]).vector[0] = 1; |
| ((LongColumnVector) batch.cols[2]).vector[0] = 130; |
| ((LongColumnVector) batch.cols[3]).vector[0] = 0x123456789abcdef0L; |
| ((DoubleColumnVector) batch.cols[4]).vector[0] = 1.125; |
| ((DoubleColumnVector) batch.cols[5]).vector[0] = 0.0009765625; |
| ((LongColumnVector) batch.cols[6]).vector[0] = |
| new DateWritable(new Date(111, 6, 1)).getDays(); |
| ((TimestampColumnVector) batch.cols[7]).set(0, |
| new Timestamp(115, 9, 23, 10, 11, 59, |
| 999999999)); |
| ((DecimalColumnVector) batch.cols[8]).vector[0] = |
| new HiveDecimalWritable("1.234567"); |
| ((BytesColumnVector) batch.cols[9]).setVal(0, "Echelon".getBytes(StandardCharsets.UTF_8)); |
| ((BytesColumnVector) batch.cols[10]).setVal(0, "Juggernaut".getBytes(StandardCharsets.UTF_8)); |
| ((BytesColumnVector) batch.cols[11]).setVal(0, "Dreadnaught".getBytes(StandardCharsets.UTF_8)); |
| ((LongColumnVector) ((StructColumnVector) batch.cols[12]).fields[0]) |
| .vector[0] = 123; |
| ((UnionColumnVector) batch.cols[13]).tags[0] = 1; |
| ((LongColumnVector) ((UnionColumnVector) batch.cols[13]).fields[1]) |
| .vector[0] = 1234; |
| ((ListColumnVector) batch.cols[14]).offsets[0] = 0; |
| ((ListColumnVector) batch.cols[14]).lengths[0] = 3; |
| ((ListColumnVector) batch.cols[14]).child.isRepeating = true; |
| ((LongColumnVector) ((ListColumnVector) batch.cols[14]).child).vector[0] |
| = 31415; |
| ((MapColumnVector) batch.cols[15]).offsets[0] = 0; |
| ((MapColumnVector) batch.cols[15]).lengths[0] = 3; |
| ((MapColumnVector) batch.cols[15]).values.isRepeating = true; |
| ((BytesColumnVector) ((MapColumnVector) batch.cols[15]).keys) |
| .setVal(0, "ORC".getBytes(StandardCharsets.UTF_8)); |
| ((BytesColumnVector) ((MapColumnVector) batch.cols[15]).keys) |
| .setVal(1, "Hive".getBytes(StandardCharsets.UTF_8)); |
| ((BytesColumnVector) ((MapColumnVector) batch.cols[15]).keys) |
| .setVal(2, "LLAP".getBytes(StandardCharsets.UTF_8)); |
| ((BytesColumnVector) ((MapColumnVector) batch.cols[15]).values) |
| .setVal(0, "fast".getBytes(StandardCharsets.UTF_8)); |
| writer.addRowBatch(batch); |
| |
| // write 1024 null without repeat |
| for(int c = 0; c < batch.cols.length; ++c) { |
| batch.cols[c].setRepeating(false); |
| batch.cols[c].noNulls = false; |
| Arrays.fill(batch.cols[c].isNull, true); |
| } |
| writer.addRowBatch(batch); |
| |
| // add 1024 rows of non-null, non-repeating |
| batch.reset(); |
| batch.size = 1024; |
| // each row has 3 list elements and 3 map entries |
| ((ListColumnVector) batch.cols[14]).child.ensureSize(3 * 1024, false); |
| ((MapColumnVector) batch.cols[15]).keys.ensureSize(3 * 1024, false); |
| ((MapColumnVector) batch.cols[15]).values.ensureSize(3 * 1024, false); |
| for(int r=0; r < 1024; ++r) { |
| ((BytesColumnVector) batch.cols[0]).setVal(r, |
| Integer.toHexString(r).getBytes(StandardCharsets.UTF_8)); |
| ((LongColumnVector) batch.cols[1]).vector[r] = r % 2; |
| ((LongColumnVector) batch.cols[2]).vector[r] = (r % 255); |
| ((LongColumnVector) batch.cols[3]).vector[r] = 31415L * r; |
| ((DoubleColumnVector) batch.cols[4]).vector[r] = 1.125 * r; |
| ((DoubleColumnVector) batch.cols[5]).vector[r] = 0.0009765625 * r; |
| ((LongColumnVector) batch.cols[6]).vector[r] = |
| new DateWritable(new Date(111, 6, 1)).getDays() + r; |
| |
| Timestamp ts = new Timestamp(115, 9, 25, 10, 11, 59 + r, 999999999); |
| ((TimestampColumnVector) batch.cols[7]).set(r, ts); |
| ((DecimalColumnVector) batch.cols[8]).vector[r] = |
| new HiveDecimalWritable("1.234567"); |
| ((BytesColumnVector) batch.cols[9]).setVal(r, |
| Integer.toString(r).getBytes(StandardCharsets.UTF_8)); |
| ((BytesColumnVector) batch.cols[10]).setVal(r, |
| Integer.toHexString(r).getBytes(StandardCharsets.UTF_8)); |
| ((BytesColumnVector) batch.cols[11]).setVal(r, |
| Integer.toHexString(r * 128).getBytes(StandardCharsets.UTF_8)); |
| ((LongColumnVector) ((StructColumnVector) batch.cols[12]).fields[0]) |
| .vector[r] = r + 13; |
| ((UnionColumnVector) batch.cols[13]).tags[r] = 1; |
| ((LongColumnVector) ((UnionColumnVector) batch.cols[13]).fields[1]) |
| .vector[r] = r + 42; |
| ((ListColumnVector) batch.cols[14]).offsets[r] = 3 * r; |
| ((ListColumnVector) batch.cols[14]).lengths[r] = 3; |
| for(int i=0; i < 3; ++i) { |
| ((LongColumnVector) ((ListColumnVector) batch.cols[14]).child) |
| .vector[3 * r + i] = 31415 + i; |
| } |
| ((MapColumnVector) batch.cols[15]).offsets[r] = 3 * r; |
| ((MapColumnVector) batch.cols[15]).lengths[r] = 3; |
| for(int i=0; i < 3; ++i) { |
| ((BytesColumnVector) ((MapColumnVector) batch.cols[15]).keys) |
| .setVal(3 * r + i, Integer.toHexString(3 * r + i).getBytes(StandardCharsets.UTF_8)); |
| ((BytesColumnVector) ((MapColumnVector) batch.cols[15]).values) |
| .setVal(3 * r + i, Integer.toString(3 * r + i).getBytes(StandardCharsets.UTF_8)); |
| } |
| } |
| writer.addRowBatch(batch); |
| |
| writer.close(); |
| |
| Reader reader = OrcFile.createReader(testFilePath, |
| OrcFile.readerOptions(conf).filesystem(fs)); |
| |
| // check the stats |
| ColumnStatistics[] stats = reader.getStatistics(); |
| assertArrayEquals(stats, writer.getStatistics()); |
| assertEquals(4096, stats[0].getNumberOfValues()); |
| assertEquals(false, stats[0].hasNull()); |
| // two of the four batches were null for every column |
| for(TypeDescription colType: schema.getChildren()) { |
| assertEquals("count on " + colType.getId(), |
| 2048, stats[colType.getId()].getNumberOfValues()); |
| assertEquals("hasNull on " + colType.getId(), |
| true, stats[colType.getId()].hasNull()); |
| } |
| assertEquals(8944, ((BinaryColumnStatistics) stats[1]).getSum()); |
| assertEquals(1536, ((BooleanColumnStatistics) stats[2]).getTrueCount()); |
| assertEquals(512, ((BooleanColumnStatistics) stats[2]).getFalseCount()); |
| assertEquals(false, ((IntegerColumnStatistics) stats[4]).isSumDefined()); |
| assertEquals(0, ((IntegerColumnStatistics) stats[4]).getMinimum()); |
| assertEquals(0x123456789abcdef0L, |
| ((IntegerColumnStatistics) stats[4]).getMaximum()); |
| assertEquals("0", ((StringColumnStatistics) stats[10]).getMinimum()); |
| assertEquals("Echelon", ((StringColumnStatistics) stats[10]).getMaximum()); |
| assertEquals(10154, ((StringColumnStatistics) stats[10]).getSum()); |
| // the char(10) statistics cover the space padded values (the sum below |
| // is 10 bytes for each of the 2048 values), so the expected minimum and |
| // maximum are padded to 10 characters as well |
| assertEquals(String.format("%-10s", "0"), |
| ((StringColumnStatistics) stats[11]).getMinimum()); |
| assertEquals(String.format("%-10s", "ff"), |
| ((StringColumnStatistics) stats[11]).getMaximum()); |
| assertEquals(20480, ((StringColumnStatistics) stats[11]).getSum()); |
| assertEquals("0", |
| ((StringColumnStatistics) stats[12]).getMinimum()); |
| assertEquals("ff80", |
| ((StringColumnStatistics) stats[12]).getMaximum()); |
| assertEquals(14813, ((StringColumnStatistics) stats[12]).getSum()); |
| |
| RecordReader rows = reader.rows(); |
| batch = reader.getSchema().createRowBatch(1024); |
| BytesColumnVector bins = (BytesColumnVector) batch.cols[0]; |
| LongColumnVector bools = (LongColumnVector) batch.cols[1]; |
| LongColumnVector bytes = (LongColumnVector) batch.cols[2]; |
| LongColumnVector longs = (LongColumnVector) batch.cols[3]; |
| DoubleColumnVector floats = (DoubleColumnVector) batch.cols[4]; |
| DoubleColumnVector doubles = (DoubleColumnVector) batch.cols[5]; |
| LongColumnVector dates = (LongColumnVector) batch.cols[6]; |
| TimestampColumnVector times = (TimestampColumnVector) batch.cols[7]; |
| DecimalColumnVector decs = (DecimalColumnVector) batch.cols[8]; |
| BytesColumnVector strs = (BytesColumnVector) batch.cols[9]; |
| BytesColumnVector chars = (BytesColumnVector) batch.cols[10]; |
| BytesColumnVector vcs = (BytesColumnVector) batch.cols[11]; |
| StructColumnVector structs = (StructColumnVector) batch.cols[12]; |
| UnionColumnVector unions = (UnionColumnVector) batch.cols[13]; |
| ListColumnVector lists = (ListColumnVector) batch.cols[14]; |
| MapColumnVector maps = (MapColumnVector) batch.cols[15]; |
| LongColumnVector structInts = (LongColumnVector) structs.fields[0]; |
| LongColumnVector unionInts = (LongColumnVector) unions.fields[1]; |
| LongColumnVector listInts = (LongColumnVector) lists.child; |
| BytesColumnVector mapKeys = (BytesColumnVector) maps.keys; |
| BytesColumnVector mapValues = (BytesColumnVector) maps.values; |
| |
| Assert.assertEquals(true, rows.nextBatch(batch)); |
| assertEquals(1024, batch.size); |
| |
| // read the 1024 nulls |
| for(int f=0; f < batch.cols.length; ++f) { |
| assertEquals("field " + f, |
| true, batch.cols[f].isRepeating); |
| assertEquals("field " + f, |
| false, batch.cols[f].noNulls); |
| assertEquals("field " + f, |
| true, batch.cols[f].isNull[0]); |
| } |
| |
| // read the 1024 repeat values |
| Assert.assertEquals(true, rows.nextBatch(batch)); |
| assertEquals(1024, batch.size); |
| for(int r=0; r < 1024; ++r) { |
| assertEquals("row " + r, "Horton", bins.toString(r)); |
| assertEquals("row " + r, 1, bools.vector[r]); |
| // 130 was written to the byte column and comes back as (byte) 130 |
| assertEquals("row " + r, -126, bytes.vector[r]); |
| assertEquals("row " + r, 1311768467463790320L, longs.vector[r]); |
| assertEquals("row " + r, 1.125, floats.vector[r], 0.00001); |
| assertEquals("row " + r, 9.765625E-4, doubles.vector[r], 0.000001); |
| assertEquals("row " + r, "2011-07-01", |
| new DateWritable((int) dates.vector[r]).toString()); |
| assertEquals("row " + r, "2015-10-23 10:11:59.999999999", |
| times.asScratchTimestamp(r).toString()); |
| assertEquals("row " + r, "1.234567", decs.vector[r].toString()); |
| assertEquals("row " + r, "Echelon", strs.toString(r)); |
| assertEquals("row " + r, "Juggernaut", chars.toString(r)); |
| // "Dreadnaught" is truncated to the varchar(10) limit |
| assertEquals("row " + r, "Dreadnaugh", vcs.toString(r)); |
| assertEquals("row " + r, 123, structInts.vector[r]); |
| assertEquals("row " + r, 1, unions.tags[r]); |
| assertEquals("row " + r, 1234, unionInts.vector[r]); |
| assertEquals("row " + r, 3, lists.lengths[r]); |
| assertEquals("row " + r, true, listInts.isRepeating); |
| assertEquals("row " + r, 31415, listInts.vector[0]); |
| assertEquals("row " + r, 3, maps.lengths[r]); |
| assertEquals("row " + r, "ORC", mapKeys.toString((int) maps.offsets[r])); |
| assertEquals("row " + r, "Hive", mapKeys.toString((int) maps.offsets[r] + 1)); |
| assertEquals("row " + r, "LLAP", mapKeys.toString((int) maps.offsets[r] + 2)); |
| assertEquals("row " + r, "fast", mapValues.toString((int) maps.offsets[r])); |
| assertEquals("row " + r, "fast", mapValues.toString((int) maps.offsets[r] + 1)); |
| assertEquals("row " + r, "fast", mapValues.toString((int) maps.offsets[r] + 2)); |
| } |
| |
| // read the second set of 1024 nulls |
| Assert.assertEquals(true, rows.nextBatch(batch)); |
| assertEquals(1024, batch.size); |
| for(int f=0; f < batch.cols.length; ++f) { |
| assertEquals("field " + f, |
| true, batch.cols[f].isRepeating); |
| assertEquals("field " + f, |
| false, batch.cols[f].noNulls); |
| assertEquals("field " + f, |
| true, batch.cols[f].isNull[0]); |
| } |
| |
| // read the 1024 non-null, non-repeating rows |
| Assert.assertEquals(true, rows.nextBatch(batch)); |
| assertEquals(1024, batch.size); |
| for(int r=0; r < 1024; ++r) { |
| String hex = Integer.toHexString(r); |
| |
| assertEquals("row " + r, hex, bins.toString(r)); |
| assertEquals("row " + r, r % 2 == 1 ? 1 : 0, bools.vector[r]); |
| assertEquals("row " + r, (byte) (r % 255), bytes.vector[r]); |
| assertEquals("row " + r, 31415L * r, longs.vector[r]); |
| assertEquals("row " + r, 1.125F * r, floats.vector[r], 0.0001); |
| assertEquals("row " + r, 0.0009765625 * r, doubles.vector[r], 0.000001); |
| assertEquals("row " + r, new DateWritable(new Date(111, 6, 1 + r)), |
| new DateWritable((int) dates.vector[r])); |
| assertEquals("row " + r, |
| new Timestamp(115, 9, 25, 10, 11, 59 + r, 999999999), |
| times.asScratchTimestamp(r)); |
| assertEquals("row " + r, "1.234567", decs.vector[r].toString()); |
| assertEquals("row " + r, Integer.toString(r), strs.toString(r)); |
| assertEquals("row " + r, Integer.toHexString(r), chars.toString(r)); |
| assertEquals("row " + r, Integer.toHexString(r * 128), vcs.toString(r)); |
| assertEquals("row " + r, r + 13, structInts.vector[r]); |
| assertEquals("row " + r, 1, unions.tags[r]); |
| assertEquals("row " + r, r + 42, unionInts.vector[r]); |
| assertEquals("row " + r, 3, lists.lengths[r]); |
| assertEquals("row " + r, 31415, listInts.vector[(int) lists.offsets[r]]); |
| assertEquals("row " + r, 31416, listInts.vector[(int) lists.offsets[r] + 1]); |
| assertEquals("row " + r, 31417, listInts.vector[(int) lists.offsets[r] + 2]); |
| // check the length of this row's map rather than always row 3 |
| assertEquals("row " + r, 3, maps.lengths[r]); |
| assertEquals("row " + r, Integer.toHexString(3 * r), mapKeys.toString((int) maps.offsets[r])); |
| assertEquals("row " + r, Integer.toString(3 * r), mapValues.toString((int) maps.offsets[r])); |
| assertEquals("row " + r, Integer.toHexString(3 * r + 1), mapKeys.toString((int) maps.offsets[r] + 1)); |
| assertEquals("row " + r, Integer.toString(3 * r + 1), mapValues.toString((int) maps.offsets[r] + 1)); |
| assertEquals("row " + r, Integer.toHexString(3 * r + 2), mapKeys.toString((int) maps.offsets[r] + 2)); |
| assertEquals("row " + r, Integer.toString(3 * r + 2), mapValues.toString((int) maps.offsets[r] + 2)); |
| } |
| |
| // should have no more rows |
| Assert.assertEquals(false, rows.nextBatch(batch)); |
| rows.close(); |
| } |
| |
| private static String makeString(BytesColumnVector vector, int row) { |
| if (vector.isRepeating) { |
| row = 0; |
| } |
| if (vector.noNulls || !vector.isNull[row]) { |
| return new String(vector.vector[row], vector.start[row], |
| vector.length[row], StandardCharsets.UTF_8); |
| } else { |
| return null; |
| } |
| } |
| |
| /** |
| * Test the char and varchar padding and truncation. |
| * The same four strings (empty, short, exactly 10 characters, and longer |
| * than 10 characters) are written to a char(10) and a varchar(10) column |
| * and the values read back are compared for both columns. |
| * @throws Exception |
| */ |
| @Test |
| public void testStringPadding() throws Exception { |
| TypeDescription schema = TypeDescription.createStruct() |
| .addField("char", TypeDescription.createChar().withMaxLength(10)) |
| .addField("varchar", TypeDescription.createVarchar().withMaxLength(10)); |
| Writer writer = OrcFile.createWriter(testFilePath, |
| OrcFile.writerOptions(conf).setSchema(schema).version(fileFormat)); |
| VectorizedRowBatch batch = schema.createRowBatch(); |
| batch.size = 4; |
| for(int c=0; c < batch.cols.length; ++c) { |
| ((BytesColumnVector) batch.cols[c]).setVal(0, "".getBytes(StandardCharsets.UTF_8)); |
| ((BytesColumnVector) batch.cols[c]).setVal(1, "xyz".getBytes(StandardCharsets.UTF_8)); |
| ((BytesColumnVector) batch.cols[c]).setVal(2, "0123456789".getBytes(StandardCharsets.UTF_8)); |
| ((BytesColumnVector) batch.cols[c]).setVal(3, |
| "0123456789abcdef".getBytes(StandardCharsets.UTF_8)); |
| } |
| writer.addRowBatch(batch); |
| writer.close(); |
| |
| Reader reader = OrcFile.createReader(testFilePath, |
| OrcFile.readerOptions(conf)); |
| RecordReader rows = reader.rows(); |
| batch = reader.getSchema().createRowBatch(); |
| Assert.assertEquals(true, rows.nextBatch(batch)); |
| assertEquals(4, batch.size); |
| // ORC currently trims the output strings. See HIVE-12286 |
| // so both columns read back the same values: short strings without |
| // padding and the long string cut down to 10 characters |
| String[] expected = {"", "xyz", "0123456789", "0123456789"}; |
| for(int c=0; c < batch.cols.length; ++c) { |
| for(int r=0; r < expected.length; ++r) { |
| assertEquals("column " + c + " row " + r, expected[r], |
| makeString((BytesColumnVector) batch.cols[c], r)); |
| } |
| } |
| rows.close(); |
| } |
| |
| /** |
| * A test case that tests the case where you add a repeating batch |
| * to a column that isn't using dictionary encoding. |
| * The first batch holds 1024 distinct strings and the second batch repeats |
| * a single string. The test is skipped for the 0.11 file format. |
| * @throws Exception |
| */ |
| @Test |
| public void testNonDictionaryRepeatingString() throws Exception { |
| Assume.assumeTrue(fileFormat != OrcFile.Version.V_0_11); |
| TypeDescription schema = TypeDescription.createStruct() |
| .addField("str", TypeDescription.createString()); |
| Writer writer = OrcFile.createWriter(testFilePath, |
| OrcFile.writerOptions(conf) |
| .setSchema(schema) |
| .rowIndexStride(1000) |
| .version(fileFormat)); |
| VectorizedRowBatch batch = schema.createRowBatch(); |
| batch.size = 1024; |
| // first batch: every row is a different string |
| for(int r=0; r < batch.size; ++r) { |
| ((BytesColumnVector) batch.cols[0]).setVal(r, |
| Integer.toString(r * 10001).getBytes(StandardCharsets.UTF_8)); |
| } |
| writer.addRowBatch(batch); |
| // second batch: the same string repeated for all 1024 rows |
| batch.cols[0].isRepeating = true; |
| ((BytesColumnVector) batch.cols[0]).setVal(0, "Halloween".getBytes(StandardCharsets.UTF_8)); |
| writer.addRowBatch(batch); |
| writer.close(); |
| |
| Reader reader = OrcFile.createReader(testFilePath, |
| OrcFile.readerOptions(conf)); |
| RecordReader rows = reader.rows(); |
| batch = reader.getSchema().createRowBatch(); |
| Assert.assertEquals(true, rows.nextBatch(batch)); |
| assertEquals(1024, batch.size); |
| for(int r=0; r < 1024; ++r) { |
| assertEquals(Integer.toString(r * 10001), |
| makeString((BytesColumnVector) batch.cols[0], r)); |
| } |
| Assert.assertEquals(true, rows.nextBatch(batch)); |
| assertEquals(1024, batch.size); |
| for(int r=0; r < 1024; ++r) { |
| assertEquals("Halloween", |
| makeString((BytesColumnVector) batch.cols[0], r)); |
| } |
| Assert.assertEquals(false, rows.nextBatch(batch)); |
| rows.close(); |
| } |
| |
| @Test |
| public void testStructs() throws Exception { |
| TypeDescription schema = TypeDescription.createStruct() |
| .addField("struct", TypeDescription.createStruct() |
| .addField("inner", TypeDescription.createLong())); |
| Writer writer = OrcFile.createWriter(testFilePath, |
| OrcFile.writerOptions(conf).setSchema(schema).version(fileFormat)); |
| VectorizedRowBatch batch = schema.createRowBatch(); |
| batch.size = 1024; |
| StructColumnVector outer = (StructColumnVector) batch.cols[0]; |
| outer.noNulls = false; |
| for(int r=0; r < 1024; ++r) { |
| if (r < 200 || (r >= 400 && r < 600) || r >= 800) { |
| outer.isNull[r] = true; |
| } |
| ((LongColumnVector) outer.fields[0]).vector[r] = r; |
| } |
| writer.addRowBatch(batch); |
| writer.close(); |
| Reader reader = OrcFile.createReader(testFilePath, |
| OrcFile.readerOptions(conf)); |
| RecordReader rows = reader.rows(); |
| batch = reader.getSchema().createRowBatch(); |
| rows.nextBatch(batch); |
| assertEquals(1024, batch.size); |
| StructColumnVector inner = (StructColumnVector) batch.cols[0]; |
| LongColumnVector vec = (LongColumnVector) inner.fields[0]; |
| for(int r=0; r < 1024; ++r) { |
| if (r < 200 || (r >= 400 && r < 600) || r >= 800) { |
| assertEquals("row " + r, true, inner.isNull[r]); |
| } else { |
| assertEquals("row " + r, false, inner.isNull[r]); |
| assertEquals("row " + r, r, vec.vector[r]); |
| } |
| } |
| rows.nextBatch(batch); |
| assertEquals(0, batch.size); |
| } |
| |
| /** |
| * Test Unions. |
| * A union of int and long is written with runs of null rows, rows that use |
| * the int variant (tag 0), and rows that use the long variant (tag 1), and |
| * the null flags, tags, and values are checked after reading them back. |
| * @throws Exception |
| */ |
| @Test |
| public void testUnions() throws Exception { |
| TypeDescription schema = TypeDescription.createStruct() |
| .addField("outer", TypeDescription.createUnion() |
| .addUnionChild(TypeDescription.createInt()) |
| .addUnionChild(TypeDescription.createLong())); |
| Writer writer = OrcFile.createWriter(testFilePath, |
| OrcFile.writerOptions(conf).setSchema(schema).version(fileFormat)); |
| VectorizedRowBatch batch = schema.createRowBatch(); |
| batch.size = 1024; |
| UnionColumnVector outer = (UnionColumnVector) batch.cols[0]; |
| batch.cols[0].noNulls = false; |
| for(int r=0; r < 1024; ++r) { |
| if (r < 200) { |
| outer.isNull[r] = true; |
| } else if (r < 300) { |
| outer.tags[r] = 0; |
| } else if (r < 400) { |
| outer.tags[r] = 1; |
| } else if (r < 600) { |
| outer.isNull[r] = true; |
| } else if (r < 800) { |
| outer.tags[r] = 1; |
| } else if (r < 1000) { |
| outer.isNull[r] = true; |
| } else { |
| outer.tags[r] = 1; |
| } |
| // both variants are filled in for every row, whatever the tag is |
| ((LongColumnVector) outer.fields[0]).vector[r] = r; |
| ((LongColumnVector) outer.fields[1]).vector[r] = -r; |
| } |
| writer.addRowBatch(batch); |
| writer.close(); |
| Reader reader = OrcFile.createReader(testFilePath, |
| OrcFile.readerOptions(conf)); |
| RecordReader rows = reader.rows(); |
| batch = reader.getSchema().createRowBatch(1024); |
| UnionColumnVector union = (UnionColumnVector) batch.cols[0]; |
| LongColumnVector ints = (LongColumnVector) union.fields[0]; |
| LongColumnVector longs = (LongColumnVector) union.fields[1]; |
| Assert.assertEquals(true, rows.nextBatch(batch)); |
| assertEquals(1024, batch.size); |
| for(int r=0; r < 1024; ++r) { |
| if (r < 200) { |
| assertEquals("row " + r, true, union.isNull[r]); |
| } else if (r < 300) { |
| assertEquals("row " + r, false, union.isNull[r]); |
| assertEquals("row " + r, 0, union.tags[r]); |
| assertEquals("row " + r, r, ints.vector[r]); |
| } else if (r < 400) { |
| assertEquals("row " + r, false, union.isNull[r]); |
| assertEquals("row " + r, 1, union.tags[r]); |
| assertEquals("row " + r, -r, longs.vector[r]); |
| } else if (r < 600) { |
| assertEquals("row " + r, true, union.isNull[r]); |
| } else if (r < 800) { |
| assertEquals("row " + r, false, union.isNull[r]); |
| assertEquals("row " + r, 1, union.tags[r]); |
| assertEquals("row " + r, -r, longs.vector[r]); |
| } else if (r < 1000) { |
| assertEquals("row " + r, true, union.isNull[r]); |
| } else { |
| assertEquals("row " + r, false, union.isNull[r]); |
| assertEquals("row " + r, 1, union.tags[r]); |
| assertEquals("row " + r, -r, longs.vector[r]); |
| } |
| } |
| Assert.assertEquals(false, rows.nextBatch(batch)); |
| rows.close(); |
| } |
| |
| /** |
| * Test lists and how they interact with the child column. In particular, |
| * put nulls between back to back lists and then make some lists that |
| * overlap. |
| * @throws Exception |
| */ |
| @Test |
| public void testLists() throws Exception { |
| TypeDescription schema = TypeDescription.createStruct() |
| .addField("list", |
| TypeDescription.createList(TypeDescription.createLong())); |
| Writer writer = OrcFile.createWriter(testFilePath, |
| OrcFile.writerOptions(conf).setSchema(schema).version(fileFormat)); |
| VectorizedRowBatch batch = schema.createRowBatch(); |
| batch.size = 1024; |
| ListColumnVector list = (ListColumnVector) batch.cols[0]; |
| list.noNulls = false; |
| for(int r=0; r < 1024; ++r) { |
| if (r < 200) { |
| list.isNull[r] = true; |
| } else if (r < 300) { |
| list.offsets[r] = r - 200; |
| list.lengths[r] = 1; |
| } else if (r < 400) { |
| list.isNull[r] = true; |
| } else if (r < 500) { |
| // these continue in the child right where rows 200-299 stopped |
| list.offsets[r] = r - 300; |
| list.lengths[r] = 1; |
| } else if (r < 600) { |
| list.isNull[r] = true; |
| } else if (r < 700) { |
| // two elements starting at r, so each list shares an element with |
| // the next one |
| list.offsets[r] = r; |
| list.lengths[r] = 2; |
| } else { |
| list.isNull[r] = true; |
| } |
| ((LongColumnVector) list.child).vector[r] = r * 10; |
| } |
| writer.addRowBatch(batch); |
| writer.close(); |
| |
| Reader reader = OrcFile.createReader(testFilePath, |
| OrcFile.readerOptions(conf)); |
| RecordReader rows = reader.rows(); |
| batch = reader.getSchema().createRowBatch(1024); |
| list = (ListColumnVector) batch.cols[0]; |
| Assert.assertEquals(true, rows.nextBatch(batch)); |
| assertEquals(1024, batch.size); |
| for(int r=0; r < 1024; ++r) { |
| StringBuilder actual = new StringBuilder(); |
| list.stringifyValue(actual, r); |
| if (r < 200) { |
| assertEquals("row " + r, "null", actual.toString()); |
| } else if (r < 300) { |
| assertEquals("row " + r, "[" + ((r - 200) * 10) + "]", |
| actual.toString()); |
| } else if (r < 400) { |
| assertEquals("row " + r, "null", actual.toString()); |
| } else if (r < 500) { |
| assertEquals("row " + r, "[" + ((r - 300) * 10) + "]", |
| actual.toString()); |
| } else if (r < 600) { |
| assertEquals("row " + r, "null", actual.toString()); |
| } else if (r < 700) { |
| assertEquals("row " + r, "[" + (10 * r) + ", " + (10 * (r + 1)) + "]", |
| actual.toString()); |
| } else { |
| assertEquals("row " + r, "null", actual.toString()); |
| } |
| } |
| Assert.assertEquals(false, rows.nextBatch(batch)); |
| rows.close(); |
| } |
| |
| /** |
| * Test maps and how they interact with the child column. In particular, |
| * put nulls between back to back lists and then make some lists that |
| * oper lap. |
| * @throws Exception |
| */ |
| @Test |
| public void testMaps() throws Exception { |
| TypeDescription schema = TypeDescription.createStruct() |
| .addField("map", |
| TypeDescription.createMap(TypeDescription.createLong(), |
| TypeDescription.createLong())); |
| Writer writer = OrcFile.createWriter(testFilePath, |
| OrcFile.writerOptions(conf).setSchema(schema).version(fileFormat)); |
| VectorizedRowBatch batch = schema.createRowBatch(); |
| batch.size = 1024; |
| MapColumnVector map = (MapColumnVector) batch.cols[0]; |
| map.noNulls = false; |
| for(int r=0; r < 1024; ++r) { |
| if (r < 200) { |
| map.isNull[r] = true; |
| } else if (r < 300) { |
| map.offsets[r] = r - 200; |
| map.lengths[r] = 1; |
| } else if (r < 400) { |
| map.isNull[r] = true; |
| } else if (r < 500) { |
| map.offsets[r] = r - 300; |
| map.lengths[r] = 1; |
| } else if (r < 600) { |
| map.isNull[r] = true; |
| } else if (r < 700) { |
| map.offsets[r] = r; |
| map.lengths[r] = 2; |
| } else { |
| map.isNull[r] = true; |
| } |
| ((LongColumnVector) map.keys).vector[r] = r; |
| ((LongColumnVector) map.values).vector[r] = r * 10; |
| } |
| writer.addRowBatch(batch); |
| writer.close(); |
| |
| Reader reader = OrcFile.createReader(testFilePath, |
| OrcFile.readerOptions(conf)); |
| RecordReader rows = reader.rows(); |
| batch = reader.getSchema().createRowBatch(); |
| map = (MapColumnVector) batch.cols[0]; |
| rows.nextBatch(batch); |
| assertEquals(1024, batch.size); |
| for(int r=0; r < 1024; ++r) { |
| StringBuilder buffer = new StringBuilder(); |
| map.stringifyValue(buffer, r); |
| String actual = buffer.toString(); |
| if (r < 200) { |
| assertEquals("row " + r, "null", actual); |
| } else if (r < 300) { |
| assertEquals("row " + r, "[{\"key\": " + (r - 200) + |
| ", \"value\": " + ((r - 200) * 10) + "}]", |
| actual); |
| } else if (r < 400) { |
| assertEquals("row " + r, "null", actual); |
| } else if (r < 500) { |
| assertEquals("row " + r, "[{\"key\": " + (r - 300) + |
| ", \"value\": " + ((r - 300) * 10) + "}]", actual); |
| } else if (r < 600) { |
| assertEquals("row " + r, "null", actual); |
| } else if (r < 700) { |
| assertEquals("row " + r, "[{\"key\": " + r + ", \"value\": " + (r * 10) |
| + "}, {\"key\": " + (r + 1) + ", \"value\": " + (10 * (r + 1)) |
| + "}]", actual); |
| } else { |
| assertEquals("row " + r, "null", actual); |
| } |
| } |
| rows.nextBatch(batch); |
| assertEquals(0, batch.size); |
| } |
| |
| @Test |
| public void testExpansion() throws Exception { |
| TypeDescription schema = |
| TypeDescription.fromString( |
| "struct<list1:array<string>," + |
| "list2:array<binary>>"); |
| Writer writer = OrcFile.createWriter(testFilePath, |
| OrcFile.writerOptions(conf).setSchema(schema).version(fileFormat)); |
| VectorizedRowBatch batch = schema.createRowBatch(); |
| batch.size = 2; |
| ListColumnVector list1 = (ListColumnVector) batch.cols[0]; |
| BytesColumnVector str = (BytesColumnVector) list1.child; |
| str.ensureSize(6000, false); |
| ListColumnVector list2 = (ListColumnVector) batch.cols[1]; |
| BytesColumnVector bin = (BytesColumnVector) list2.child; |
| bin.ensureSize(6000, false); |
| list1.offsets[0] = 0; |
| list1.lengths[0] = 2000; |
| list2.offsets[1] = 2000; |
| list2.lengths[1] = 3000; |
| for(int v=0; v < 5000; ++v) { |
| byte[] bytes = Long.toHexString(v).getBytes(StandardCharsets.UTF_8); |
| str.setVal(v, bytes); |
| bin.setVal(v, bytes); |
| } |
| writer.addRowBatch(batch); |
| writer.close(); |
| Reader reader = OrcFile.createReader(testFilePath, |
| OrcFile.readerOptions(conf)); |
| RecordReader rows = reader.rows(); |
| batch = reader.getSchema().createRowBatch(); |
| assertTrue(rows.nextBatch(batch)); |
| assertEquals(2, batch.size); |
| assertFalse(rows.nextBatch(batch)); |
| rows.close(); |
| } |
| |
| /** |
| * Check that writer implementation ids and writer version ids map to the |
| * expected enum constants, and that WriterVersion.includes orders them. |
| */ |
| @Test |
| public void testWriterVersion() throws Exception { |
| Assume.assumeTrue(fileFormat == OrcFile.Version.V_0_11); |
| final OrcFile.WriterImplementation orcJava = OrcFile.WriterImplementation.ORC_JAVA; |
| final OrcFile.WriterImplementation orcCpp = OrcFile.WriterImplementation.ORC_CPP; |
| final OrcFile.WriterImplementation presto = OrcFile.WriterImplementation.PRESTO; |
| final OrcFile.WriterImplementation unknown = OrcFile.WriterImplementation.UNKNOWN; |
| final OrcFile.WriterVersion future = OrcFile.WriterVersion.FUTURE; |
| final OrcFile.WriterVersion hive4243 = OrcFile.WriterVersion.HIVE_4243; |
| final OrcFile.WriterVersion hive12055 = OrcFile.WriterVersion.HIVE_12055; |
| |
| // writer implementation ids: unrecognised ids collapse to UNKNOWN |
| assertEquals(orcJava, OrcFile.WriterImplementation.from(0)); |
| assertEquals(orcCpp, OrcFile.WriterImplementation.from(1)); |
| assertEquals(presto, OrcFile.WriterImplementation.from(2)); |
| assertEquals(unknown, OrcFile.WriterImplementation.from(99)); |
| |
| // writer version ids, per implementation |
| assertEquals(future, OrcFile.WriterVersion.from(orcJava, 99)); |
| assertEquals(OrcFile.WriterVersion.ORIGINAL, |
| OrcFile.WriterVersion.from(orcJava, 0)); |
| assertEquals(hive4243, OrcFile.WriterVersion.from(orcJava, 2)); |
| assertEquals(future, OrcFile.WriterVersion.from(orcCpp, 99)); |
| assertEquals(OrcFile.WriterVersion.ORC_CPP_ORIGINAL, |
| OrcFile.WriterVersion.from(orcCpp, 6)); |
| assertEquals(OrcFile.WriterVersion.PRESTO_ORIGINAL, |
| OrcFile.WriterVersion.from(presto, 6)); |
| assertEquals(future, OrcFile.WriterVersion.from(unknown, 0)); |
| |
| // compatibility ordering between writer versions |
| assertTrue(future.includes(OrcFile.WriterVersion.ORC_CPP_ORIGINAL)); |
| assertTrue(future.includes(OrcFile.WriterVersion.HIVE_8732)); |
| assertTrue(hive12055.includes(hive4243)); |
| assertTrue(hive12055.includes(hive12055)); |
| assertFalse(hive4243.includes(hive12055)); |
| assertTrue(hive12055.includes(OrcFile.WriterVersion.PRESTO_ORIGINAL)); |
| } |
| |
| /** |
| * Writer version id 0 combined with the PRESTO implementation is expected |
| * to be rejected with an IllegalArgumentException. |
| */ |
| @Test(expected=IllegalArgumentException.class) |
| public void testBadPrestoVersion() { |
| Assume.assumeTrue(fileFormat == OrcFile.Version.V_0_11); |
| final OrcFile.WriterImplementation presto = OrcFile.WriterImplementation.PRESTO; |
| final int rejectedVersionId = 0; |
| OrcFile.WriterVersion.from(presto, rejectedVersionId); |
| } |
| |
| /** |
| * Verify that the version number list is translated to the matching |
| * OrcFile.Version constant. |
| * @throws Exception |
| */ |
| @Test |
| public void testFileVersion() throws Exception { |
| Assume.assumeTrue(fileFormat == OrcFile.Version.V_0_11); |
| List<Integer> emptyVersion = new ArrayList<>(); |
| // a missing or empty version list maps to 0.11 |
| assertEquals(OrcFile.Version.V_0_11, ReaderImpl.getFileVersion(null)); |
| assertEquals(OrcFile.Version.V_0_11, ReaderImpl.getFileVersion(emptyVersion)); |
| // explicit major/minor pairs |
| assertEquals(OrcFile.Version.V_0_11, |
| ReaderImpl.getFileVersion(Arrays.asList(0, 11))); |
| assertEquals(OrcFile.Version.V_0_12, |
| ReaderImpl.getFileVersion(Arrays.asList(0, 12))); |
| assertEquals(OrcFile.Version.FUTURE, |
| ReaderImpl.getFileVersion(Arrays.asList(9999, 0))); |
| } |
| |
| /** |
| * OrcFile.understandFormat must refuse readers that report a FUTURE file |
| * version or a FUTURE writer version, and accept the current ones. |
| */ |
| @Test |
| public void testMergeUnderstood() throws Exception { |
| Assume.assumeTrue(fileFormat == OrcFile.Version.V_0_11); |
| Path path = new Path("test.orc"); |
| |
| // file version from the future, known writer version |
| Reader newerFile = Mockito.mock(Reader.class); |
| Mockito.when(newerFile.getFileVersion()).thenReturn(OrcFile.Version.FUTURE); |
| Mockito.when(newerFile.getWriterVersion()).thenReturn(OrcFile.WriterVersion.HIVE_4243); |
| assertFalse(OrcFile.understandFormat(path, newerFile)); |
| |
| // known file version, writer version from the future |
| Reader newerWriter = Mockito.mock(Reader.class); |
| Mockito.when(newerWriter.getFileVersion()).thenReturn(OrcFile.Version.V_0_11); |
| Mockito.when(newerWriter.getWriterVersion()).thenReturn(OrcFile.WriterVersion.FUTURE); |
| assertFalse(OrcFile.understandFormat(path, newerWriter)); |
| |
| // both versions are the current ones |
| Reader upToDate = Mockito.mock(Reader.class); |
| Mockito.when(upToDate.getFileVersion()).thenReturn(OrcFile.Version.CURRENT); |
| Mockito.when(upToDate.getWriterVersion()).thenReturn(OrcFile.CURRENT_WRITER); |
| assertTrue(OrcFile.understandFormat(path, upToDate)); |
| } |
| |
| static ByteBuffer fromString(String s) { |
| return ByteBuffer.wrap(s.getBytes(StandardCharsets.UTF_8)); |
| } |
| |
| static byte[] fromLong(long x) { |
| return Long.toHexString(x).getBytes(StandardCharsets.UTF_8); |
| } |
| |
| /** |
| * Merge ORC files with OrcFile.mergeFiles and check how many inputs were |
| * accepted and what the merged file reports (row count, compression, |
| * buffer size, row index stride, versions, stripes and user metadata). |
| * @throws Exception if a file cannot be written, merged or read back |
| */ |
| @Test |
| public void testMerge() throws Exception { |
| Path input1 = new Path(workDir, "TestVectorOrcFile.testMerge1-" + |
| fileFormat.getName() + ".orc"); |
| fs.delete(input1, false); |
| Path input2 = new Path(workDir, "TestVectorOrcFile.testMerge2-" + |
| fileFormat.getName() + ".orc"); |
| fs.delete(input2, false); |
| Path input3 = new Path(workDir, "TestVectorOrcFile.testMerge3-" + |
| fileFormat.getName() + ".orc"); |
| fs.delete(input3, false); |
| TypeDescription schema = TypeDescription.fromString("struct<a:int,b:string>"); |
| // change all of the options away from default to find anything we |
| // don't copy to the merged file |
| OrcFile.WriterOptions opts = OrcFile.writerOptions(conf) |
| .setSchema(schema) |
| .compress(CompressionKind.LZO) |
| .enforceBufferSize() |
| .bufferSize(20*1024) |
| .rowIndexStride(1000) |
| .version(fileFormat) |
| .writerVersion(OrcFile.WriterVersion.HIVE_8732); |
| |
| Writer writer = OrcFile.createWriter(input1, opts); |
| VectorizedRowBatch batch = schema.createRowBatch(); |
| batch.size = 1024; |
| for(int r=0; r < 1024; ++r) { |
| ((LongColumnVector) batch.cols[0]).vector[r] = r; |
| ((BytesColumnVector) batch.cols[1]).setVal(r, fromLong(r)); |
| } |
| writer.addRowBatch(batch); |
| writer.addUserMetadata("a", fromString("foo")); |
| writer.addUserMetadata("b", fromString("bar")); |
| writer.close(); |
| |
| // increase the buffer size to 30k |
| opts.bufferSize(30*1024); |
| writer = OrcFile.createWriter(input2, opts); |
| batch.size = 1024; |
| for(int r=0; r < 1024; ++r) { |
| ((LongColumnVector) batch.cols[0]).vector[r] = 2 * r; |
| ((BytesColumnVector) batch.cols[1]).setVal(r, fromLong(2 * r)); |
| } |
| writer.addRowBatch(batch); |
| writer.addUserMetadata("a", fromString("foo")); |
| writer.addUserMetadata("c", fromString("baz")); |
| writer.close(); |
| |
| // decrease the buffer size to 10k |
| opts.bufferSize(10*1024); |
| writer = OrcFile.createWriter(input3, opts); |
| batch.size = 1024; |
| for(int r=0; r < 1024; ++r) { |
| ((LongColumnVector) batch.cols[0]).vector[r] = 3 * r; |
| ((BytesColumnVector) batch.cols[1]).setVal(r, fromLong(3 * r)); |
| } |
| writer.addRowBatch(batch); |
| writer.addUserMetadata("c", fromString("baz")); |
| writer.addUserMetadata("d", fromString("bat")); |
| writer.close(); |
| |
| // first merge: all three inputs are expected to be accepted |
| Path output1 = new Path(workDir, "TestVectorOrcFile.testMerge.out1-" + |
| fileFormat.getName() + ".orc"); |
| fs.delete(output1, false); |
| List<Path> paths = OrcFile.mergeFiles(output1, |
| OrcFile.writerOptions(conf), Arrays.asList(input1, input2, input3)); |
| assertEquals(3, paths.size()); |
| // try-with-resources so the reader is released even when an assertion fails |
| try (Reader reader = OrcFile.createReader(output1, OrcFile.readerOptions(conf))) { |
| assertEquals(3 * 1024, reader.getNumberOfRows()); |
| assertEquals(CompressionKind.LZO, reader.getCompressionKind()); |
| // 30k is the largest buffer size among the three inputs |
| assertEquals(30 * 1024, reader.getCompressionSize()); |
| assertEquals(1000, reader.getRowIndexStride()); |
| assertEquals(fileFormat, reader.getFileVersion()); |
| assertEquals(OrcFile.WriterVersion.HIVE_8732, reader.getWriterVersion()); |
| assertEquals(3, reader.getStripes().size()); |
| assertEquals(4, reader.getMetadataKeys().size()); |
| assertEquals(fromString("foo"), reader.getMetadataValue("a")); |
| assertEquals(fromString("bar"), reader.getMetadataValue("b")); |
| assertEquals(fromString("baz"), reader.getMetadataValue("c")); |
| assertEquals(fromString("bat"), reader.getMetadataValue("d")); |
| } |
| |
| // input4 uses a different schema from the other inputs |
| TypeDescription schema4 = TypeDescription.fromString("struct<a:int>"); |
| Path input4 = new Path(workDir, "TestVectorOrcFile.testMerge4-" + |
| fileFormat.getName() + ".orc"); |
| fs.delete(input4, false); |
| opts.setSchema(schema4); |
| writer = OrcFile.createWriter(input4, opts); |
| batch = schema4.createRowBatch(); |
| batch.size = 1024; |
| for(int r=0; r < 1024; ++r) { |
| ((LongColumnVector) batch.cols[0]).vector[r] = 4 * r; |
| } |
| writer.addRowBatch(batch); |
| writer.close(); |
| |
| // input5 uses the original schema but no compression |
| Path input5 = new Path(workDir, "TestVectorOrcFile.testMerge5-" + |
| fileFormat.getName() + ".orc"); |
| fs.delete(input5, false); |
| opts.setSchema(schema) |
| .compress(CompressionKind.NONE) |
| .bufferSize(100*1024); |
| writer = OrcFile.createWriter(input5, opts); |
| batch = schema.createRowBatch(); |
| batch.size = 1024; |
| for(int r=0; r < 1024; ++r) { |
| ((LongColumnVector) batch.cols[0]).vector[r] = 4 * r; |
| ((BytesColumnVector) batch.cols[1]).setVal(r, fromLong(5 * r)); |
| } |
| writer.addRowBatch(batch); |
| writer.close(); |
| |
| // second merge: only two of the four inputs are expected to be accepted |
| // (presumably input3 and input1, given the metadata and 20k buffer below) |
| Path output2 = new Path(workDir, "TestVectorOrcFile.testMerge.out2-" + |
| fileFormat.getName() + ".orc"); |
| fs.delete(output2, false); |
| paths = OrcFile.mergeFiles(output2, OrcFile.writerOptions(conf), |
| Arrays.asList(input3, input4, input1, input5)); |
| assertEquals(2, paths.size()); |
| try (Reader reader = OrcFile.createReader(output2, OrcFile.readerOptions(conf))) { |
| assertEquals(2 * 1024, reader.getNumberOfRows()); |
| assertEquals(CompressionKind.LZO, reader.getCompressionKind()); |
| assertEquals(20 * 1024, reader.getCompressionSize()); |
| assertEquals(1000, reader.getRowIndexStride()); |
| assertEquals(fileFormat, reader.getFileVersion()); |
| assertEquals(OrcFile.WriterVersion.HIVE_8732, reader.getWriterVersion()); |
| assertEquals(2, reader.getStripes().size()); |
| assertEquals(4, reader.getMetadataKeys().size()); |
| assertEquals(fromString("foo"), reader.getMetadataValue("a")); |
| assertEquals(fromString("bar"), reader.getMetadataValue("b")); |
| assertEquals(fromString("baz"), reader.getMetadataValue("c")); |
| assertEquals(fromString("bat"), reader.getMetadataValue("d")); |
| } |
| } |
| |
| /** |
| * Write a mergeable file to test merging files with column encryption. |
| * Each stripe is one batch of 1024 rows followed by an intermediate footer. |
| * @param path the path to write to |
| * @param provider the key provider |
| * @param startValue the base value for the columns |
| * @param stripes the number of stripes to write |
| * @param bufferSize the buffer size to use for the compression |
| * @param encrypt the encryption string |
| * @param mask the mask string |
| * @return the values returned by writeIntermediateFooter after each stripe, |
| * one entry per stripe |
| * @throws IOException |
| */ |
| private long[] writeMergeableFile(Path path, |
| KeyProvider provider, |
| long startValue, |
| int stripes, |
| int bufferSize, |
| String encrypt, |
| String mask) throws IOException { |
| fs.delete(path, false); |
| TypeDescription schema = TypeDescription.fromString( |
| "struct<a:int,b:struct<c:string,d:string>>"); |
| |
| // change all of the options away from default to find anything we |
| // don't copy to the merged file |
| OrcFile.WriterOptions opts = OrcFile.writerOptions(conf) |
| .setSchema(schema) |
| .rowIndexStride(1000) |
| .version(fileFormat) |
| .bufferSize(bufferSize) |
| .enforceBufferSize() |
| .setKeyProvider(provider) |
| .encrypt(encrypt) |
| .masks(mask); |
| long[] intermediateFooters = new long[stripes]; |
| |
| Writer writer = OrcFile.createWriter(path, opts); |
| VectorizedRowBatch batch = schema.createRowBatch(); |
| LongColumnVector a = (LongColumnVector) batch.cols[0]; |
| StructColumnVector b = (StructColumnVector) batch.cols[1]; |
| BytesColumnVector c = (BytesColumnVector) b.fields[0]; |
| BytesColumnVector d = (BytesColumnVector) b.fields[1]; |
| batch.size = 1024; |
| // honour the requested stripe count; the loop bound must match the size |
| // of intermediateFooters or the array is overrun / left partly unfilled |
| for(int btch=0; btch < stripes; ++btch) { |
| for (int r = 0; r < 1024; ++r) { |
| long value = startValue + btch * 1024 + r; |
| a.vector[r] = value; |
| c.setVal(r, fromLong(value)); |
| d.setVal(r, String.format("%010x", value * 1_000_001) |
| .getBytes(StandardCharsets.UTF_8)); |
| } |
| writer.addRowBatch(batch); |
| // write an intermediate footer to force a stripe |
| intermediateFooters[btch] = writer.writeIntermediateFooter(); |
| } |
| writer.close(); |
| return intermediateFooters; |
| } |
| |
| /** |
| * Compute the SHA-256 digest of the UTF-8 bytes of a string. |
| * @param value the text to hash |
| * @return the digest as 64 upper-case hexadecimal characters |
| * @throws RuntimeException wrapping NoSuchAlgorithmException if SHA-256 is |
| * not available |
| */ |
| static String computeSha(String value) { |
| try { |
| MessageDigest md = MessageDigest.getInstance("SHA-256"); |
| byte[] digest = md.digest(value.getBytes(StandardCharsets.UTF_8)); |
| // hex-encode by hand: javax.xml.bind.DatatypeConverter is not part of |
| // the JDK from Java 11 on; upper case matches printHexBinary's output |
| char[] hexDigits = "0123456789ABCDEF".toCharArray(); |
| StringBuilder hex = new StringBuilder(digest.length * 2); |
| for (byte b : digest) { |
| hex.append(hexDigits[(b >> 4) & 0xF]); |
| hex.append(hexDigits[b & 0xF]); |
| } |
| return hex.toString(); |
| } catch (NoSuchAlgorithmException e) { |
| throw new RuntimeException(e); |
| } |
| } |
| |
| /** |
| * Merge files written with column encryption and masking. |
| * Five inputs are written with writeMergeableFile: three share the same |
| * encryption and mask strings, one has no mask, and one has neither |
| * encryption nor mask. The merged file is then read both without and with |
| * the key provider, checking file statistics, stripe statistics and row |
| * contents. Skipped when the file format is 0.11. |
| */ |
| @Test |
| public void testEncryptMerge() throws Exception { |
| Assume.assumeTrue(fileFormat != OrcFile.Version.V_0_11); |
| Path input1 = new Path(workDir, "TestVectorOrcFile.testEncryptMerge1-" + |
| fileFormat.getName() + ".orc"); |
| Path input2 = new Path(workDir, "TestVectorOrcFile.testEncryptMerge2-" + |
| fileFormat.getName() + ".orc"); |
| Path input3 = new Path(workDir, "TestVectorOrcFile.testEncryptMerge3-" + |
| fileFormat.getName() + ".orc"); |
| Path input4 = new Path(workDir, "TestVectorOrcFile.testEncryptMerge4-" + |
| fileFormat.getName() + ".orc"); |
| Path input5 = new Path(workDir, "TestVectorOrcFile.testEncryptMerge5-" + |
| fileFormat.getName() + ".orc"); |
| Random random = new Random(169); |
| InMemoryKeystore keystore = new InMemoryKeystore(random); |
| EncryptionAlgorithm algorithm = EncryptionAlgorithm.AES_CTR_128; |
| byte[] piiKey = new byte[algorithm.keyLength()]; |
| byte[] topSecretKey = new byte[algorithm.keyLength()]; |
| random.nextBytes(piiKey); |
| random.nextBytes(topSecretKey); |
| keystore.addKey("pii", algorithm, piiKey) |
| .addKey("top_secret", algorithm, topSecretKey); |
| String encryption = "pii:a;top_secret:b"; |
| String mask = "sha256,`don't worry`:b"; |
| |
| // write three files that should merge, each with 3 stripes of 1024 rows. |
| long[] cuts = writeMergeableFile(input1, keystore, 0, 3, 0x400, encryption, mask); |
| writeMergeableFile(input2, keystore, 3 * 1024, 3, 0x800, encryption, mask); |
| writeMergeableFile(input3, keystore, 6 * 1024, 3, 0xc00, encryption, mask); |
| // two files that aren't mergeable |
| writeMergeableFile(input4, keystore, 9 * 1024, 3, 0x400, encryption, null); |
| writeMergeableFile(input5, keystore, 12 * 1024, 3, 0x400, null, null); |
| |
| // make sure that we can read up to the intermediate footers |
| try (Reader reader = OrcFile.createReader(input1, OrcFile.readerOptions(conf) |
| .maxLength(cuts[0]))) { |
| assertEquals(1024, reader.getNumberOfRows()); |
| } |
| try (Reader reader = OrcFile.createReader(input1, OrcFile.readerOptions(conf) |
| .maxLength(cuts[1]))) { |
| assertEquals(2 * 1024, reader.getNumberOfRows()); |
| } |
| try (Reader reader = OrcFile.createReader(input1, OrcFile.readerOptions(conf) |
| .maxLength(cuts[2]))) { |
| assertEquals(3 * 1024, reader.getNumberOfRows()); |
| } |
| |
| // make a new version of the pii key |
| keystore.addKey("pii", 1, algorithm, new byte[algorithm.keyLength()]); |
| Path merge1 = new Path(workDir, "TestVectorOrcFile.testEncryptMerge.merge1-" + |
| fileFormat.getName() + ".orc"); |
| |
| // merge all three files together |
| fs.delete(merge1, false); |
| List<Path> paths = OrcFile.mergeFiles(merge1, |
| OrcFile.writerOptions(conf).setKeyProvider(keystore), |
| Arrays.asList(input1, input2, input3)); |
| assertEquals(3, paths.size()); |
| |
| // test reading with no keys |
| Reader reader = OrcFile.createReader(merge1, OrcFile.readerOptions(conf)); |
| assertEquals(9 * 1024, reader.getNumberOfRows()); |
| assertEquals(CompressionKind.ZLIB, reader.getCompressionKind()); |
| assertEquals(1000, reader.getRowIndexStride()); |
| assertEquals(0xc00, reader.getCompressionSize()); |
| assertEquals(fileFormat, reader.getFileVersion()); |
| assertEquals(9, reader.getStripes().size()); |
| EncryptionKey[] keys = reader.getColumnEncryptionKeys(); |
| assertEquals(2, keys.length); |
| assertEquals("pii", keys[0].getKeyName()); |
| assertEquals(0, keys[0].getKeyVersion()); |
| assertEquals(false, keys[0].isAvailable()); |
| assertEquals("top_secret", keys[1].getKeyName()); |
| assertEquals(0, keys[1].getKeyVersion()); |
| assertEquals(false, keys[1].isAvailable()); |
| // check the file stats |
| // (without keys, column 1 reports no values and columns 3 and 4 expose |
| // 64-character upper-case hex strings instead of the written text) |
| ColumnStatistics[] stats = reader.getStatistics(); |
| assertEquals(9 * 1024, stats[0].getNumberOfValues()); |
| assertEquals(0, stats[1].getNumberOfValues()); |
| assertEquals(9 * 1024, stats[2].getNumberOfValues()); |
| assertEquals(9 * 1024, stats[3].getNumberOfValues()); |
| assertEquals("00037F39CF870A1F49129F9C82D935665D352FFD25EA3296208F6F7B16FD654F", |
| ((StringColumnStatistics) stats[3]).getMinimum()); |
| assertEquals("FFF60CF25C8E227396BC77DD808773DA69D767D6B0417ADB1A0CAC51CC168797", |
| ((StringColumnStatistics) stats[3]).getMaximum()); |
| assertEquals(9 * 1024, stats[4].getNumberOfValues()); |
| assertEquals("001277C7986C02D9CDA490756055C6A81F3838D3394F18806DD3359AAD59862A", |
| ((StringColumnStatistics) stats[4]).getMinimum()); |
| assertEquals("FFFF1E62E46263E623F704AC22C2F27E5BBDED8693546A2A11F011251A53D23D", |
| ((StringColumnStatistics) stats[4]).getMaximum()); |
| // check the stripe stats |
| List<StripeStatistics> stripeStats = reader.getStripeStatistics(); |
| for(int s=0; s < stripeStats.size(); ++s) { |
| ColumnStatistics[] cs = stripeStats.get(s).getColumnStatistics(); |
| assertEquals("stripe " + s, 1024, cs[0].getNumberOfValues()); |
| assertEquals("stripe " + s, 0, cs[1].getNumberOfValues()); |
| assertEquals("stripe " + s, 1024, cs[2].getNumberOfValues()); |
| assertEquals("stripe " + s, 1024, cs[3].getNumberOfValues()); |
| assertEquals("stripe " + s, 64, ((StringColumnStatistics) cs[3]).getMinimum().length()); |
| assertEquals("stripe " + s, 64, ((StringColumnStatistics) cs[3]).getMaximum().length()); |
| assertEquals("stripe " + s, 1024, cs[4].getNumberOfValues()); |
| assertEquals("stripe " + s, 64, ((StringColumnStatistics) cs[4]).getMinimum().length()); |
| assertEquals("stripe " + s, 64, ((StringColumnStatistics) cs[4]).getMaximum().length()); |
| } |
| // check the file contents |
| // (without keys, a is expected to read as null while c and d read as |
| // the computeSha value of the originally written text) |
| RecordReader rows = reader.rows(); |
| VectorizedRowBatch batch = reader.getSchema().createRowBatchV2(); |
| LongColumnVector a = (LongColumnVector) batch.cols[0]; |
| StructColumnVector b = (StructColumnVector) batch.cols[1]; |
| BytesColumnVector c = (BytesColumnVector) b.fields[0]; |
| BytesColumnVector d = (BytesColumnVector) b.fields[1]; |
| for(int btch=0; btch < 9; ++btch) { |
| assertEquals(true, rows.nextBatch(batch)); |
| assertEquals(1024, batch.size); |
| for(int r=0; r < batch.size; ++r) { |
| long value = btch * 1024 + r; |
| assertEquals("batch " + btch + " row " + r, true, a.isNull[r]); |
| assertEquals("batch " + btch + " row " + r, |
| computeSha(Long.toHexString(value)), c.toString(r)); |
| assertEquals("batch " + btch + " row " + r, |
| computeSha(String.format("%010x", value * 1_000_001)), |
| d.toString(r)); |
| } |
| } |
| assertEquals(false, rows.nextBatch(batch)); |
| rows.close(); |
| reader.close(); |
| |
| // test reading with keys |
| reader = OrcFile.createReader(merge1, |
| OrcFile.readerOptions(conf).setKeyProvider(keystore)); |
| assertEquals(9 * 1024, reader.getNumberOfRows()); |
| keys = reader.getColumnEncryptionKeys(); |
| assertEquals(2, keys.length); |
| assertEquals("pii", keys[0].getKeyName()); |
| assertEquals(0, keys[0].getKeyVersion()); |
| assertEquals(true, keys[0].isAvailable()); |
| assertEquals("top_secret", keys[1].getKeyName()); |
| assertEquals(0, keys[1].getKeyVersion()); |
| assertEquals(true, keys[1].isAvailable()); |
| // check the file stats |
| stats = reader.getStatistics(); |
| assertEquals(9 * 1024, stats[0].getNumberOfValues()); |
| assertEquals(9 * 1024, stats[1].getNumberOfValues()); |
| assertEquals(0, ((IntegerColumnStatistics) stats[1]).getMinimum()); |
| assertEquals(9 * 1024 - 1, ((IntegerColumnStatistics) stats[1]).getMaximum()); |
| assertEquals(9 * 1024, stats[2].getNumberOfValues()); |
| assertEquals(9 * 1024, stats[3].getNumberOfValues()); |
| assertEquals("0", ((StringColumnStatistics) stats[3]).getMinimum()); |
| assertEquals("fff", ((StringColumnStatistics) stats[3]).getMaximum()); |
| assertEquals(9 * 1024, stats[4].getNumberOfValues()); |
| assertEquals("0000000000", ((StringColumnStatistics) stats[4]).getMinimum()); |
| assertEquals("022541e1bf", ((StringColumnStatistics) stats[4]).getMaximum()); |
| // check the stripe stats |
| stripeStats = reader.getStripeStatistics(); |
| for(int s=0; s < stripeStats.size(); ++s) { |
| long low = s * 1024; |
| long high = s * 1024 + 1023; |
| ColumnStatistics[] cs = stripeStats.get(s).getColumnStatistics(); |
| assertEquals("stripe " + s, 1024, cs[0].getNumberOfValues()); |
| assertEquals("stripe " + s, 1024, cs[1].getNumberOfValues()); |
| assertEquals("stripe " + s, low, ((IntegerColumnStatistics) cs[1]).getMinimum()); |
| assertEquals("stripe " + s, high, ((IntegerColumnStatistics) cs[1]).getMaximum()); |
| assertEquals("stripe " + s, 1024, cs[2].getNumberOfValues()); |
| assertEquals("stripe " + s, 1024, cs[3].getNumberOfValues()); |
| assertEquals("stripe " + s, Long.toHexString(low), |
| ((StringColumnStatistics) cs[3]).getMinimum()); |
| // stripe 0 holds the hex strings "0".."3ff"; presumably the string |
| // statistics order values as text, which makes "ff" the maximum there |
| assertEquals("stripe " + s, s == 0 ? "ff" : Long.toHexString(high), |
| ((StringColumnStatistics) cs[3]).getMaximum()); |
| assertEquals("stripe " + s, 1024, cs[4].getNumberOfValues()); |
| assertEquals("stripe " + s, String.format("%010x", 1_000_001 * low), |
| ((StringColumnStatistics) cs[4]).getMinimum()); |
| assertEquals("stripe " + s, String.format("%010x", 1_000_001 * high), |
| ((StringColumnStatistics) cs[4]).getMaximum()); |
| } |
| // check the file contents |
| rows = reader.rows(); |
| for(int btch=0; btch < 9; ++btch) { |
| assertEquals(true, rows.nextBatch(batch)); |
| assertEquals(1024, batch.size); |
| for(int r=0; r < batch.size; ++r) { |
| long value = btch * 1024 + r; |
| assertEquals("batch " + btch + " row " + r, value, a.vector[r]); |
| assertEquals("batch " + btch + " row " + r, Long.toHexString(value), |
| c.toString(r)); |
| assertEquals("batch " + btch + " row " + r, |
| String.format("%010x", value * 1_000_001), d.toString(r)); |
| } |
| } |
| assertEquals(false, rows.nextBatch(batch)); |
| rows.close(); |
| reader.close(); |
| |
| Path merge2 = new Path(workDir, "TestVectorOrcFile.testEncryptMerge.merge2-" + |
| fileFormat.getName() + ".orc"); |
| fs.delete(merge2, false); |
| paths = OrcFile.mergeFiles(merge2, |
| OrcFile.writerOptions(conf).setKeyProvider(keystore), |
| Arrays.asList(input2, input4, input1, input5)); |
| |
| // make sure only input1 & input2 were merged |
| assertEquals(2, paths.size()); |
| assertEquals(true, paths.contains(input1)); |
| assertEquals(true, paths.contains(input2)); |
| |
| reader = OrcFile.createReader(merge2, OrcFile.readerOptions(conf)); |
| assertEquals(2 * 3 * 1024, reader.getNumberOfRows()); |
| assertEquals(CompressionKind.ZLIB, reader.getCompressionKind()); |
| assertEquals(0x800, reader.getCompressionSize()); |
| assertEquals(1000, reader.getRowIndexStride()); |
| assertEquals(fileFormat, reader.getFileVersion()); |
| assertEquals(6, reader.getStripes().size()); |
| assertEquals(2, reader.getColumnEncryptionKeys().length); |
| assertEquals(2, reader.getDataMasks().length); |
| assertEquals(2, reader.getEncryptionVariants().length); |
| reader.close(); |
| } |
| |
| // Directory the example ORC files are resolved against; taken from the |
| // "example.dir" system property and defaulting to "../../examples/". |
| Path exampleDir = new Path(System.getProperty("example.dir", |
| "../../examples/")); |
| |
| /** |
| * Open the example file "zero.orc" and check that it is reported as an |
| * empty ORC file: no rows, an empty struct schema, no compression, no |
| * stripes, statistics or metadata, and the current file/writer versions. |
| * @throws Exception if the file cannot be opened or read |
| */ |
| @Test |
| public void testZeroByteOrcFile() throws Exception { |
| // we only have to run this test once, since it is a 0 byte file. |
| Assume.assumeTrue(fileFormat == OrcFile.Version.V_0_11); |
| Path zeroFile = new Path(exampleDir, "zero.orc"); |
| // try-with-resources so the reader is released even when an assertion fails |
| try (Reader reader = OrcFile.createReader(zeroFile, OrcFile.readerOptions(conf))) { |
| assertEquals(0, reader.getNumberOfRows()); |
| assertEquals("struct<>", reader.getSchema().toString()); |
| assertEquals(CompressionKind.NONE, reader.getCompressionKind()); |
| assertEquals(0, reader.getRawDataSize()); |
| assertEquals(0, reader.getRowIndexStride()); |
| assertEquals(0, reader.getCompressionSize()); |
| assertEquals(0, reader.getMetadataSize()); |
| assertEquals(OrcFile.Version.CURRENT, reader.getFileVersion()); |
| assertEquals(0, reader.getStripes().size()); |
| assertEquals(0, reader.getStatistics().length); |
| assertEquals(0, reader.getMetadataKeys().size()); |
| assertEquals(OrcFile.CURRENT_WRITER, reader.getWriterVersion()); |
| VectorizedRowBatch batch = |
| TypeDescription.fromString("struct<>").createRowBatch(); |
| // the record reader is closed as well instead of being dropped |
| try (RecordReader rows = reader.rows()) { |
| assertEquals(false, rows.nextBatch(batch)); |
| } |
| } |
| } |
| |
| /** |
| * Opening the example file "version1999.orc" is expected to fail with an |
| * UnknownFormatException that carries the path, the version string "19.99" |
| * and the file's postscript. |
| */ |
| @Test |
| public void testFutureOrcFile() throws Exception { |
| Assume.assumeTrue(fileFormat == OrcFile.Version.V_0_11); |
| final String fileName = "version1999.orc"; |
| Path futureFile = new Path(exampleDir, fileName); |
| try { |
| OrcFile.createReader(futureFile, OrcFile.readerOptions(conf)); |
| // reaching this line means the unknown version was silently accepted |
| Assert.fail("no exception for bad version"); |
| } catch (UnknownFormatException unknownFormat) { |
| assertEquals("path is correct", fileName, unknownFormat.getPath().getName()); |
| assertEquals("19.99", unknownFormat.getVersionString()); |
| OrcProto.PostScript postscript = unknownFormat.getPostscript(); |
| assertEquals("ORC", postscript.getMagic()); |
| assertEquals(OrcProto.CompressionKind.NONE, postscript.getCompression()); |
| } |
| } |
| |
| @Test |
| public void testEmptyDoubleStream() throws Exception { |
| TypeDescription schema = |
| TypeDescription.fromString("struct<list1:array<double>," + |
| "list2:array<float>>"); |
| Writer writer = OrcFile.createWriter(testFilePath, |
| OrcFile.writerOptions(conf).setSchema(schema).version(fileFormat)); |
| VectorizedRowBatch batch = schema.createRowBatch(); |
| batch.size = 2; |
| ListColumnVector list1 = (ListColumnVector) batch.cols[0]; |
| ListColumnVector list2 = (ListColumnVector) batch.cols[1]; |
| for(int r=0; r < batch.size; ++r) { |
| list1.offsets[r] = 0; |
| list1.lengths[r] = 0; |
| list2.offsets[r] = 0; |
| list2.lengths[r] = 0; |
| } |
| writer.addRowBatch(batch); |
| writer.close(); |
| Reader reader = OrcFile.createReader(testFilePath, |
| OrcFile.readerOptions(conf)); |
| RecordReader rows = reader.rows(); |
| batch = reader.getSchema().createRowBatch(); |
| assertTrue(rows.nextBatch(batch)); |
| assertEquals(2, batch.size); |
| list1 = (ListColumnVector) batch.cols[0]; |
| list2 = (ListColumnVector) batch.cols[1]; |
| for(int r=0; r < batch.size; ++r) { |
| assertEquals(0, list1.lengths[r]); |
| assertEquals(0, list2.lengths[r]); |
| } |
| assertFalse(rows.nextBatch(batch)); |
| rows.close(); |
| } |
| |
| @Test |
| public void testPredicatePushdownForComplex() throws Exception { |
| TypeDescription schema = createComplexInnerSchema(); |
| Writer writer = OrcFile.createWriter(testFilePath, |
| OrcFile.writerOptions(conf) |
| .setSchema(schema) |
| .stripeSize(400000L) |
| .compress(CompressionKind.NONE) |
| .bufferSize(500) |
| .rowIndexStride(1000) |
| .version(fileFormat)); |
| VectorizedRowBatch batch = schema.createRowBatch(); |
| batch.ensureSize(3500); |
| batch.size = 3500; |
| for(int i=0; i < 3500; ++i) { |
| ((LongColumnVector) batch.cols[0]).vector[i] = i; |
| ((LongColumnVector)((StructColumnVector) batch.cols[1]).fields[0]).vector[i] = i * 300; |
| ((BytesColumnVector)((StructColumnVector) batch.cols[1]).fields[1]).setVal(i, |
| Integer.toHexString(10*i).getBytes(StandardCharsets.UTF_8)); |
| } |
| writer.addRowBatch(batch); |
| writer.close(); |
| Reader reader = OrcFile.createReader(testFilePath, |
| OrcFile.readerOptions(conf).filesystem(fs)); |
| assertEquals(3500, reader.getNumberOfRows()); |
| |
| SearchArgument sarg = SearchArgumentFactory.newBuilder() |
| .startAnd() |
| .startNot() |
| .lessThan("complex.int2", PredicateLeaf.Type.LONG, 300000L) |
| .end() |
| .lessThan("complex.int2", PredicateLeaf.Type.LONG, 600000L) |
| .end() |
| .build(); |
| |
| RecordReader rows = reader.rows(reader.options() |
| .range(0L, Long.MAX_VALUE) |
| .include(new boolean[]{true, true, true, true, true}) |
| .searchArgument(sarg, new String[]{null, "int1", "complex","int2","string1"})); |
| batch = reader.getSchema().createRowBatch(2000); |
| LongColumnVector ints1 = (LongColumnVector) batch.cols[0]; |
| StructColumnVector struct1 = (StructColumnVector) batch.cols[1]; |
| LongColumnVector ints2 = (LongColumnVector) struct1.fields[0]; |
| BytesColumnVector strs = (BytesColumnVector) struct1.fields[1]; |
| |
| Assert.assertEquals(1000L, rows.getRowNumber()); |
| Assert.assertEquals(true, rows.nextBatch(batch)); |
| assertEquals(1000, batch.size); |
| |
| for(int i=1000; i < 2000; ++i) { |
| assertEquals(i,ints1.vector[i-1000]); |
| assertEquals(300 * i, ints2.vector[i - 1000]); |
| assertEquals(Integer.toHexString(10*i), strs.toString(i - 1000)); |
| } |
| Assert.assertEquals(false, rows.nextBatch(batch)); |
| Assert.assertEquals(3500, rows.getRowNumber()); |
| |
| |
| // look through the file with no rows selected |
| sarg = SearchArgumentFactory.newBuilder() |
| .startAnd() |
| .lessThan("complex.int2", PredicateLeaf.Type.LONG, 0L) |
| .end() |
| .build(); |
| rows = reader.rows(reader.options() |
| .range(0L, Long.MAX_VALUE) |
| .include(new boolean[]{true, true, true, true, true}) |
| .searchArgument(sarg, new String[]{null, "int1",null,"int2","string1"})); |
| Assert.assertEquals(3500L, rows.getRowNumber()); |
| assertTrue(!rows.nextBatch(batch)); |
| |
| // select first 100 and last 100 rows |
| sarg = SearchArgumentFactory.newBuilder() |
| .startOr() |
| .lessThan("complex.int2", PredicateLeaf.Type.LONG, 300L * 100) |
| .startNot() |
| .lessThan("complex.int2", PredicateLeaf.Type.LONG, 300L * 3400) |
| .end() |
| .end() |
| .build(); |
| rows = reader.rows(reader.options() |
| .range(0L, Long.MAX_VALUE) |
| .include(new boolean[]{true, true,true,true, true}) |
| .searchArgument(sarg, new String[]{null, "int1",null, "int2","string1"})); |
| Assert.assertEquals(0, rows.getRowNumber()); |
| Assert.assertEquals(true, rows.nextBatch(batch)); |
| assertEquals(1000, batch.size); |
| Assert.assertEquals(3000, rows.getRowNumber()); |
| |
| for(int i=0; i < 1000; ++i) { |
| assertEquals(300 * i, ints2.vector[i]); |
| assertEquals(Integer.toHexString(10*i), strs.toString(i)); |
| } |
| |
| Assert.assertEquals(true, rows.nextBatch(batch)); |
| assertEquals(500, batch.size); |
| Assert.assertEquals(3500, rows.getRowNumber()); |
| for(int i=3000; i < 3500; ++i) { |
| assertEquals(300 * i, ints2.vector[i - 3000]); |
| assertEquals(Integer.toHexString(10*i), strs.toString(i - 3000)); |
| } |
| Assert.assertEquals(false, rows.nextBatch(batch)); |
| Assert.assertEquals(3500, rows.getRowNumber()); |
| } |
| |
| @Test |
| public void testColumnEncryption() throws Exception { |
| Assume.assumeTrue(fileFormat != OrcFile.Version.V_0_11); |
| final int ROWS = 1000; |
| final int SEED = 2; |
| final Random random = new Random(SEED); |
| |
| TypeDescription schema = |
| TypeDescription.fromString("struct<i:int,norm:int,x:array<string>>"); |
| |
| byte[] piiKey = new byte[16]; |
| random.nextBytes(piiKey); |
| byte[] creditKey = new byte[32]; |
| random.nextBytes(creditKey); |
| InMemoryKeystore keys = new InMemoryKeystore(random) |
| .addKey("pii", EncryptionAlgorithm.AES_CTR_128, piiKey) |
| .addKey("credit", EncryptionAlgorithm.AES_CTR_256, creditKey); |
| Writer writer = OrcFile.createWriter(testFilePath, |
| OrcFile.writerOptions(conf) |
| .setSchema(schema) |
| .version(fileFormat) |
| .setKeyProvider(keys) |
| .encrypt("pii:i;credit:x")); |
| VectorizedRowBatch batch = schema.createRowBatch(); |
| batch.size = ROWS; |
| LongColumnVector i = (LongColumnVector) batch.cols[0]; |
| LongColumnVector norm = (LongColumnVector) batch.cols[1]; |
| ListColumnVector x = (ListColumnVector) batch.cols[2]; |
| BytesColumnVector xElem = (BytesColumnVector) x.child; |
| xElem.ensureSize(3 * ROWS, false); |
| for(int r=0; r < ROWS; ++r) { |
| i.vector[r] = r * 3; |
| norm.vector[r] = r * 5; |
| int start = x.childCount; |
| x.offsets[r] = start; |
| x.lengths[r] = 3; |
| x.childCount += x.lengths[r]; |
| for(int child=0; child < x.lengths[r]; ++child) { |
| xElem.setVal(start + child, |
| String.format("%d.%d", r, child).getBytes(StandardCharsets.UTF_8)); |
| } |
| } |
| writer.addRowBatch(batch); |
| writer.close(); |
| |
| // Read without any keys |
| Reader reader = OrcFile.createReader(testFilePath, |
| OrcFile.readerOptions(conf) |
| .setKeyProvider(new InMemoryKeystore())); |
| ColumnStatistics[] stats = reader.getStatistics(); |
| assertEquals(ROWS, stats[0].getNumberOfValues()); |
| |
| assertEquals(0, stats[1].getNumberOfValues()); |
| assertEquals(true, stats[1].hasNull()); |
| assertEquals(ROWS, stats[2].getNumberOfValues()); |
| assertEquals(0, ((IntegerColumnStatistics) stats[2]).getMinimum()); |
| assertEquals(ROWS * 5 - 5, ((IntegerColumnStatistics) stats[2]).getMaximum()); |
| assertEquals(0, stats[3].getNumberOfValues()); |
| assertEquals(0, stats[4].getNumberOfValues()); |
| |
| RecordReader rows = reader.rows(); |
| batch = reader.getSchema().createRowBatch(); |
| i = (LongColumnVector) batch.cols[0]; |
| norm = (LongColumnVector) batch.cols[1]; |
| x = (ListColumnVector) batch.cols[2]; |
| |
| // ensure that we get the right number of rows with all nulls |
| assertTrue(rows.nextBatch(batch)); |
| assertEquals(ROWS, batch.size); |
| assertEquals(true, i.isRepeating); |
| assertEquals(false, i.noNulls); |
| assertEquals(true, i.isNull[0]); |
| assertEquals(true, x.isRepeating); |
| assertEquals(false, x.noNulls); |
| assertEquals(true, x.isNull[0]); |
| for(int r=0; r < ROWS; ++r) { |
| assertEquals("row " + r, r * 5, norm.vector[r]); |
| } |
| assertFalse(rows.nextBatch(batch)); |
| rows.close(); |
| |
| // Add a new version of the pii key |
| random.nextBytes(piiKey); |
| keys.addKey("pii", 1, EncryptionAlgorithm.AES_CTR_128, piiKey); |
| |
| // Read with the keys |
| reader = OrcFile.createReader(testFilePath, |
| OrcFile.readerOptions(conf) |
| .setKeyProvider(keys)); |
| stats = reader.getStatistics(); |
| assertEquals(ROWS, stats[0].getNumberOfValues()); |
| assertEquals(ROWS, stats[1].getNumberOfValues()); |
| assertEquals(0, ((IntegerColumnStatistics) stats[1]).getMinimum()); |
| assertEquals(3 * (ROWS - 1), ((IntegerColumnStatistics) stats[1]).getMaximum()); |
| assertEquals(0, ((IntegerColumnStatistics) stats[2]).getMinimum()); |
| assertEquals(5 * (ROWS - 1), ((IntegerColumnStatistics) stats[2]).getMaximum()); |
| assertEquals(ROWS, stats[3].getNumberOfValues()); |
| assertEquals(3 * ROWS, stats[4].getNumberOfValues()); |
| assertEquals("0.0", ((StringColumnStatistics)stats[4]).getMinimum()); |
| assertEquals("999.2", ((StringColumnStatistics)stats[4]).getMaximum()); |
| |
| rows = reader.rows(); |
| batch = reader.getSchema().createRowBatch(); |
| i = (LongColumnVector) batch.cols[0]; |
| norm = (LongColumnVector) batch.cols[1]; |
| x = (ListColumnVector) batch.cols[2]; |
| xElem = (BytesColumnVector) x.child; |
| assertTrue(rows.nextBatch(batch)); |
| assertEquals(ROWS, batch.size); |
| assertEquals(false, i.isRepeating); |
| assertEquals(false, x.isRepeating); |
| assertEquals(false, xElem.isRepeating); |
| assertEquals(true, i.noNulls); |
| assertEquals(true, x.noNulls); |
| assertEquals(true, xElem.noNulls); |
| for(int r=0; r < ROWS; ++r) { |
| assertEquals("row " + r, r * 3, i.vector[r]); |
| assertEquals("row " + r, r * 5, norm.vector[r]); |
| assertEquals("row " + r, r * 3, x.offsets[r]); |
| assertEquals("row " + r, 3, x.lengths[r]); |
| for(int child=0; child < x.lengths[r]; ++child) { |
| assertEquals("row " + r + "." + child, String.format("%d.%d", r, child), |
| xElem.toString((int) x.offsets[r] + child)); |
| } |
| } |
| assertFalse(rows.nextBatch(batch)); |
| rows.close(); |
| } |
| |
| @Test |
| public void testMultiStripeColumnEncryption() throws Exception { |
| Assume.assumeTrue(fileFormat != OrcFile.Version.V_0_11); |
| final EncryptionAlgorithm algorithm = EncryptionAlgorithm.AES_CTR_128; |
| final int BATCHES = 100; |
| final int SEED = 3; |
| final Random random = new Random(SEED); |
| |
| TypeDescription schema = TypeDescription.fromString( |
| "struct<dec:decimal(20,4)," + |
| "dt:date," + |
| "time:timestamp," + |
| "dbl:double," + |
| "bool:boolean," + |
| "bin:binary>"); |
| |
| InMemoryKeystore allKeys = new InMemoryKeystore(); |
| byte[][] masterKeys = new byte[6][]; |
| for(int k=0; k < masterKeys.length; ++k) { |
| masterKeys[k] = new byte[algorithm.keyLength()]; |
| random.nextBytes(masterKeys[k]); |
| allKeys.addKey("key_" + k, algorithm, masterKeys[k]); |
| } |
| |
| Writer writer = OrcFile.createWriter(testFilePath, |
| OrcFile.writerOptions(conf) |
| .setSchema(schema) |
| .version(fileFormat) |
| .stripeSize(10000) |
| .setKeyProvider(allKeys) |
| .encrypt("key_0:dec;key_1:dt;key_2:time;key_3:dbl;key_4:bool;key_5:bin")); |
| // Set size to 1000 precisely so that stripes are exactly 5000 rows long. |
| VectorizedRowBatch batch = schema.createRowBatch(1000); |
| DecimalColumnVector dec = (DecimalColumnVector) batch.cols[0]; |
| LongColumnVector dt = (LongColumnVector) batch.cols[1]; |
| TimestampColumnVector time = (TimestampColumnVector) batch.cols[2]; |
| DoubleColumnVector dbl = (DoubleColumnVector) batch.cols[3]; |
| LongColumnVector bool = (LongColumnVector) batch.cols[4]; |
| BytesColumnVector bin = (BytesColumnVector) batch.cols[5]; |
| // Generate 100 batches of 1,000 rows each |
| batch.size = 1000; |
| dec.isRepeating = true; |
| dt.isRepeating = true; |
| time.isRepeating = true; |
| dbl.isRepeating = true; |
| bool.isRepeating = true; |
| bin.isRepeating = true; |
| for(int b=0; b < BATCHES; ++b) { |
| dec.set(0, new HiveDecimalWritable(String.format("%d.%03d", b, b))); |
| dt.vector[0] = new DateWritable(new Date(96 + b, 12, 11)).getDays(); |
| time.set(0, Timestamp.valueOf(String.format("2014-12-14 12:00:00.%04d", b))); |
| dbl.vector[0] = b + 0.5; |
| bool.vector[0] = b % 2; |
| bin.setVal(0, Integer.toString(b).getBytes(StandardCharsets.UTF_8)); |
| writer.addRowBatch(batch); |
| } |
| writer.close(); |
| |
| // Read without any keys |
| Reader reader = OrcFile.createReader(testFilePath, |
| OrcFile.readerOptions(conf) |
| .setKeyProvider(new InMemoryKeystore())); |
| checkHasData(reader.rows(), batch, BATCHES, |
| false, false, false, false, false, false); |
| |
| // read with all of the keys |
| reader = OrcFile.createReader(testFilePath, |
| OrcFile.readerOptions(conf) |
| .setKeyProvider(allKeys)); |
| checkHasData(reader.rows(), batch, BATCHES, |
| true, true, true, true, true, true); |
| |
| // try enabling each key by itself |
| for(int c=0; c < 6; ++c) { |
| InMemoryKeystore single = new InMemoryKeystore(); |
| single.addKey("key_" + c, algorithm, masterKeys[c]); |
| reader = OrcFile.createReader(testFilePath, |
| OrcFile.readerOptions(conf).setKeyProvider(single)); |
| boolean[] hasData = new boolean[6]; |
| hasData[c] = true; |
| checkHasData(reader.rows(), batch, BATCHES, hasData); |
| } |
| } |
| |
| private void checkHasData(RecordReader reader, VectorizedRowBatch batch, |
| int BATCHES, boolean... hasData) throws IOException { |
| for(int b=0; b < BATCHES; ++b) { |
| assertEquals("batch " + b, true, reader.nextBatch(batch)); |
| for(int c=0; c < hasData.length; c++) { |
| if (hasData[c]) { |
| // the expected value |
| String expected = null; |
| // a function from the row to the value as a string |
| IntFunction<String> actual = row -> null; |
| switch (c) { |
| case 0: |
| expected = new HiveDecimalWritable(String.format("%d.%03d", b, b)).toString(); |
| actual = row -> ((DecimalColumnVector) batch.cols[0]).vector[row].toString(); |
| break; |
| case 1: |
| expected = Long.toString(new DateWritable(new Date(96 + b, 12, 11)).getDays()); |
| actual = row -> Long.toString(((LongColumnVector) batch.cols[1]).vector[row]); |
| break; |
| case 2: |
| expected = Timestamp.valueOf(String.format("2014-12-14 12:00:00.%04d", b)).toString(); |
| actual = row -> ((TimestampColumnVector) batch.cols[2]).asScratchTimestamp(row).toString(); |
| break; |
| case 3: |
| expected = Double.toString(b + 0.5); |
| actual = row -> Double.toString(((DoubleColumnVector) batch.cols[3]).vector[row]); |
| break; |
| case 4: |
| expected = Long.toString(b % 2); |
| actual = row -> Long.toString(((LongColumnVector) batch.cols[4]).vector[row]); |
| break; |
| default: |
| expected = Integer.toString(b); |
| actual = row -> ((BytesColumnVector) batch.cols[5]).toString(row); |
| break; |
| } |
| assertEquals("batch " + b + " column " + c, true, batch.cols[c].noNulls); |
| assertEquals("batch " + b + " column " + c + " row 0", expected, actual.apply(0)); |
| // Not all of the readers set isRepeating, so if it isn't set, check the values. |
| if (!batch.cols[c].isRepeating) { |
| for(int r=1; r < batch.size; ++r) { |
| assertEquals("batch " + b + " column " + c + " row " + r, expected, actual.apply(r)); |
| } |
| } |
| } else { |
| assertEquals("batch " + b + " column " + c, true, batch.cols[c].isRepeating); |
| assertEquals("batch " + b + " column " + c, true, batch.cols[c].isNull[0]); |
| } |
| } |
| } |
| assertEquals("end", false, reader.nextBatch(batch)); |
| reader.close(); |
| } |
| } |