blob: dc1271574b86253c708a7bf40510fca7cf02eea6 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.orc.impl;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;
import static org.mockito.Matchers.any;
import static org.mockito.Matchers.eq;
import static org.mockito.Mockito.atLeastOnce;
import static org.mockito.Mockito.doNothing;
import static org.mockito.Mockito.doThrow;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.times;
import static org.mockito.Mockito.verify;
import static org.mockito.Mockito.when;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.nio.ByteBuffer;
import java.nio.IntBuffer;
import java.sql.Date;
import java.sql.Timestamp;
import java.text.DateFormat;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.TimeZone;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PositionedReadable;
import org.apache.hadoop.fs.Seekable;
import org.apache.hadoop.hive.common.io.DiskRangeList;
import org.apache.hadoop.hive.common.type.HiveDecimal;
import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.hadoop.hive.ql.io.sarg.SearchArgument;
import org.apache.hadoop.hive.ql.io.sarg.SearchArgumentFactory;
import org.apache.hadoop.hive.ql.io.sarg.SearchArgumentImpl;
import org.apache.orc.CompressionCodec;
import org.apache.orc.CompressionKind;
import org.apache.orc.impl.reader.ReaderEncryption;
import org.apache.orc.impl.reader.StripePlanner;
import org.apache.orc.impl.writer.StreamOptions;
import org.apache.orc.util.BloomFilter;
import org.apache.orc.DataReader;
import org.apache.orc.RecordReader;
import org.apache.orc.TestVectorOrcFile;
import org.apache.orc.TypeDescription;
import org.apache.orc.Writer;
import org.apache.orc.impl.RecordReaderImpl.Location;
import org.apache.orc.impl.RecordReaderImpl.SargApplier;
import org.apache.hadoop.hive.ql.io.sarg.PredicateLeaf;
import org.apache.hadoop.hive.ql.io.sarg.SearchArgument.TruthValue;
import org.apache.hadoop.hive.serde2.io.DateWritable;
import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.orc.ColumnStatistics;
import org.apache.orc.OrcFile;
import org.apache.orc.Reader;
import org.apache.orc.OrcProto;
import org.apache.orc.util.BloomFilterIO;
import org.apache.orc.util.BloomFilterUtf8;
import org.junit.Test;
public class TestRecordReaderImpl {
@Test
public void testFindColumn() throws Exception {
Configuration conf = new Configuration();
TypeDescription file = TypeDescription.fromString("struct<a:int,c:string,e:int>");
TypeDescription reader = TypeDescription.fromString("struct<a:int,b:double,c:string,d:double,e:bigint>");
SchemaEvolution evo = new SchemaEvolution(file, reader, new Reader.Options(conf));
assertEquals(1, RecordReaderImpl.findColumns(evo, "a"));
assertEquals(-1, RecordReaderImpl.findColumns(evo, "b"));
assertEquals(2, RecordReaderImpl.findColumns(evo, "c"));
assertEquals(-1, RecordReaderImpl.findColumns(evo, "d"));
assertEquals(3, RecordReaderImpl.findColumns(evo, "e"));
}
@Test
public void testForcePositionalEvolution() throws Exception {
Configuration conf = new Configuration();
Path oldFilePath = new Path(TestVectorOrcFile.getFileFromClasspath("orc-file-11-format.orc"));
Reader reader = OrcFile.createReader(oldFilePath,
OrcFile.readerOptions(conf).filesystem(FileSystem.getLocal(conf)));
TypeDescription fileSchema =
TypeDescription.fromString("struct<col0:boolean,col1:tinyint,col2:smallint,"
+ "col3:int,col4:bigint,col5:float,col6:double,col7:"
+ "binary,col8:string,col9:struct<list:array<struct<int1:int,"
+ "string1:string>>>,col10:array<struct<int1:int,string1:string>>,"
+ "col11:map<string,struct<int1:int,string1:string>>,col12:timestamp,"
+ "col13:decimal(38,10)>");
SchemaEvolution evo = new SchemaEvolution(fileSchema, reader.getSchema(),
new Reader.Options(conf).forcePositionalEvolution(true));
assertEquals(4, RecordReaderImpl.findColumns(evo, "int1"));
evo = new SchemaEvolution(fileSchema, reader.getSchema(),
new Reader.Options(conf).forcePositionalEvolution(false));
assertEquals(-1, RecordReaderImpl.findColumns(evo, "int1"));
TypeDescription acidSchema = SchemaEvolution.createEventSchema(fileSchema);
SchemaEvolution evoAcid =
new SchemaEvolution(acidSchema, reader.getSchema(),
new Reader.Options(conf).forcePositionalEvolution(true));
// ahead by 6 for 1 struct + 5 for row-id
assertEquals(6+4, RecordReaderImpl.findColumns(evoAcid, "int1"));
evoAcid =
new SchemaEvolution(acidSchema, reader.getSchema(),
new Reader.Options(conf).forcePositionalEvolution(false));
assertEquals(-1, RecordReaderImpl.findColumns(evoAcid, "int1"));
}
/**
* Create a predicate leaf. This is used by another test.
*/
public static PredicateLeaf createPredicateLeaf(PredicateLeaf.Operator operator,
PredicateLeaf.Type type,
String columnName,
Object literal,
List<Object> literalList) {
return new SearchArgumentImpl.PredicateLeafImpl(operator, type, columnName,
literal, literalList);
}
static class BufferInStream
extends InputStream implements PositionedReadable, Seekable {
private final byte[] buffer;
private final int length;
private int position = 0;
BufferInStream(byte[] bytes, int length) {
this.buffer = bytes;
this.length = length;
}
@Override
public int read() {
if (position < length) {
return buffer[position++];
}
return -1;
}
@Override
public int read(byte[] bytes, int offset, int length) {
int lengthToRead = Math.min(length, this.length - this.position);
if (lengthToRead >= 0) {
for(int i=0; i < lengthToRead; ++i) {
bytes[offset + i] = buffer[position++];
}
return lengthToRead;
} else {
return -1;
}
}
@Override
public int read(long position, byte[] bytes, int offset, int length) {
this.position = (int) position;
return read(bytes, offset, length);
}
@Override
public void readFully(long position, byte[] bytes, int offset,
int length) throws IOException {
this.position = (int) position;
while (length > 0) {
int result = read(bytes, offset, length);
offset += result;
length -= result;
if (result < 0) {
throw new IOException("Read past end of buffer at " + offset);
}
}
}
@Override
public void readFully(long position, byte[] bytes) throws IOException {
readFully(position, bytes, 0, bytes.length);
}
@Override
public void seek(long position) {
this.position = (int) position;
}
@Override
public long getPos() {
return position;
}
@Override
public boolean seekToNewSource(long position) throws IOException {
this.position = (int) position;
return false;
}
}
@Test
public void testMaxLengthToReader() throws Exception {
Configuration conf = new Configuration();
OrcProto.Type rowType = OrcProto.Type.newBuilder()
.setKind(OrcProto.Type.Kind.STRUCT).build();
OrcProto.Footer footer = OrcProto.Footer.newBuilder()
.setHeaderLength(0).setContentLength(0).setNumberOfRows(0)
.setRowIndexStride(0).addTypes(rowType).build();
OrcProto.PostScript ps = OrcProto.PostScript.newBuilder()
.setCompression(OrcProto.CompressionKind.NONE)
.setFooterLength(footer.getSerializedSize())
.setMagic("ORC").addVersion(0).addVersion(11).build();
DataOutputBuffer buffer = new DataOutputBuffer();
footer.writeTo(buffer);
ps.writeTo(buffer);
buffer.write(ps.getSerializedSize());
FileSystem fs = mock(FileSystem.class);
FSDataInputStream file =
new FSDataInputStream(new BufferInStream(buffer.getData(),
buffer.getLength()));
Path p = new Path("/dir/file.orc");
when(fs.open(eq(p))).thenReturn(file);
OrcFile.ReaderOptions options = OrcFile.readerOptions(conf);
options.filesystem(fs);
options.maxLength(buffer.getLength());
when(fs.getFileStatus(eq(p)))
.thenReturn(new FileStatus(10, false, 3, 3000, 0, p));
Reader reader = OrcFile.createReader(p, options);
assertEquals(0, reader.getNumberOfRows());
}
static class StubPredicate implements PredicateLeaf {
final PredicateLeaf.Type type;
StubPredicate(PredicateLeaf.Type type) {
this.type = type;
}
@Override
public Operator getOperator() {
return null;
}
@Override
public Type getType() {
return type;
}
@Override
public String getColumnName() {
return null;
}
@Override
public Object getLiteral() {
return null;
}
@Override
public List<Object> getLiteralList() {
return null;
}
}
static Location compareToRange(PredicateLeaf.Type type, Comparable point,
Comparable min, Comparable max) {
PredicateLeaf predicate = new StubPredicate(type);
return new RecordReaderImpl.ValueRange(predicate, min, max, true)
.compare(point);
}
@Test
public void testCompareToRangeInt() {
assertEquals(Location.BEFORE,
compareToRange(PredicateLeaf.Type.LONG, 19L, 20L, 40L));
assertEquals(Location.AFTER,
compareToRange(PredicateLeaf.Type.LONG, 41L, 20L, 40L));
assertEquals(Location.MIN,
compareToRange(PredicateLeaf.Type.LONG, 20L, 20L, 40L));
assertEquals(Location.MIDDLE,
compareToRange(PredicateLeaf.Type.LONG, 21L, 20L, 40L));
assertEquals(Location.MAX,
compareToRange(PredicateLeaf.Type.LONG, 40L, 20L, 40L));
assertEquals(Location.BEFORE,
compareToRange(PredicateLeaf.Type.LONG, 0L, 1L, 1L));
assertEquals(Location.MIN,
compareToRange(PredicateLeaf.Type.LONG, 1L, 1L, 1L));
assertEquals(Location.AFTER,
compareToRange(PredicateLeaf.Type.LONG, 2L, 1L, 1L));
}
@Test
public void testCompareToRangeString() {
assertEquals(Location.BEFORE,
compareToRange(PredicateLeaf.Type.STRING, "a", "b", "c"));
assertEquals(Location.AFTER,
compareToRange(PredicateLeaf.Type.STRING, "d", "b", "c"));
assertEquals(Location.MIN,
compareToRange(PredicateLeaf.Type.STRING, "b", "b", "c"));
assertEquals(Location.MIDDLE,
compareToRange(PredicateLeaf.Type.STRING, "bb", "b", "c"));
assertEquals(Location.MAX,
compareToRange(PredicateLeaf.Type.STRING, "c", "b", "c"));
assertEquals(Location.BEFORE,
compareToRange(PredicateLeaf.Type.STRING, "a", "b", "b"));
assertEquals(Location.MIN,
compareToRange(PredicateLeaf.Type.STRING, "b", "b", "b"));
assertEquals(Location.AFTER,
compareToRange(PredicateLeaf.Type.STRING, "c", "b", "b"));
}
@Test
public void testCompareToCharNeedConvert() {
assertEquals(Location.BEFORE,
compareToRange(PredicateLeaf.Type.STRING, "apple", "hello", "world"));
assertEquals(Location.AFTER,
compareToRange(PredicateLeaf.Type.STRING, "zombie", "hello", "world"));
assertEquals(Location.MIN,
compareToRange(PredicateLeaf.Type.STRING, "hello", "hello", "world"));
assertEquals(Location.MIDDLE,
compareToRange(PredicateLeaf.Type.STRING, "pilot", "hello", "world"));
assertEquals(Location.MAX,
compareToRange(PredicateLeaf.Type.STRING, "world", "hello", "world"));
assertEquals(Location.BEFORE,
compareToRange(PredicateLeaf.Type.STRING, "apple", "hello", "hello"));
assertEquals(Location.MIN,
compareToRange(PredicateLeaf.Type.STRING, "hello", "hello", "hello"));
assertEquals(Location.AFTER,
compareToRange(PredicateLeaf.Type.STRING, "zombie", "hello", "hello"));
}
@Test
public void testGetMin() throws Exception {
assertEquals(10L, RecordReaderImpl.getValueRange(
ColumnStatisticsImpl.deserialize(null, createIntStats(10L, 100L)),
new StubPredicate(PredicateLeaf.Type.LONG), true).lower);
assertEquals(10.0d, RecordReaderImpl.getValueRange(
ColumnStatisticsImpl.deserialize(null,
OrcProto.ColumnStatistics.newBuilder()
.setDoubleStatistics(OrcProto.DoubleStatistics.newBuilder()
.setMinimum(10.0d).setMaximum(100.0d).build()).build()),
new StubPredicate(PredicateLeaf.Type.FLOAT), true).lower);
assertEquals(null, RecordReaderImpl.getValueRange(
ColumnStatisticsImpl.deserialize(null,
OrcProto.ColumnStatistics.newBuilder()
.setStringStatistics(OrcProto.StringStatistics.newBuilder().build())
.build()), new StubPredicate(PredicateLeaf.Type.STRING),true).lower);
assertEquals("a", RecordReaderImpl.getValueRange(
ColumnStatisticsImpl.deserialize(null,
OrcProto.ColumnStatistics.newBuilder()
.setStringStatistics(OrcProto.StringStatistics.newBuilder()
.setMinimum("a").setMaximum("b").build()).build()),
new StubPredicate(PredicateLeaf.Type.STRING), true).lower);
assertEquals("hello", RecordReaderImpl.getValueRange(
ColumnStatisticsImpl.deserialize(null, createStringStats("hello", "world")),
new StubPredicate(PredicateLeaf.Type.STRING), true).lower);
assertEquals(new HiveDecimalWritable("111.1"), RecordReaderImpl.getValueRange(
ColumnStatisticsImpl.deserialize(null,
createDecimalStats("111.1", "112.1")),
new StubPredicate(PredicateLeaf.Type.DECIMAL), true).lower);
}
private static OrcProto.ColumnStatistics createIntStats(Long min,
Long max) {
OrcProto.IntegerStatistics.Builder intStats =
OrcProto.IntegerStatistics.newBuilder();
if (min != null) {
intStats.setMinimum(min);
}
if (max != null) {
intStats.setMaximum(max);
}
return OrcProto.ColumnStatistics.newBuilder()
.setIntStatistics(intStats.build()).build();
}
private static OrcProto.ColumnStatistics createBooleanStats(int n, int trueCount) {
OrcProto.BucketStatistics.Builder boolStats = OrcProto.BucketStatistics.newBuilder();
boolStats.addCount(trueCount);
return OrcProto.ColumnStatistics.newBuilder()
.setNumberOfValues(n).setBucketStatistics(
boolStats.build()).build();
}
private static OrcProto.ColumnStatistics createIntStats(int min, int max) {
OrcProto.IntegerStatistics.Builder intStats = OrcProto.IntegerStatistics.newBuilder();
intStats.setMinimum(min);
intStats.setMaximum(max);
return OrcProto.ColumnStatistics.newBuilder()
.setIntStatistics(intStats.build()).build();
}
private static OrcProto.ColumnStatistics createDoubleStats(double min, double max) {
OrcProto.DoubleStatistics.Builder dblStats = OrcProto.DoubleStatistics.newBuilder();
dblStats.setMinimum(min);
dblStats.setMaximum(max);
return OrcProto.ColumnStatistics.newBuilder()
.setDoubleStatistics(dblStats.build()).build();
}
//fixme
private static OrcProto.ColumnStatistics createStringStats(String min, String max,
boolean hasNull) {
OrcProto.StringStatistics.Builder strStats = OrcProto.StringStatistics.newBuilder();
strStats.setMinimum(min);
strStats.setMaximum(max);
return OrcProto.ColumnStatistics.newBuilder().setStringStatistics(strStats.build())
.setHasNull(hasNull).build();
}
private static OrcProto.ColumnStatistics createStringStats(String min, String max) {
OrcProto.StringStatistics.Builder strStats = OrcProto.StringStatistics.newBuilder();
strStats.setMinimum(min);
strStats.setMaximum(max);
return OrcProto.ColumnStatistics.newBuilder().setStringStatistics(strStats.build())
.build();
}
private static OrcProto.ColumnStatistics createDateStats(int min, int max) {
OrcProto.DateStatistics.Builder dateStats = OrcProto.DateStatistics.newBuilder();
dateStats.setMinimum(min);
dateStats.setMaximum(max);
return OrcProto.ColumnStatistics.newBuilder()
.setDateStatistics(dateStats.build()).build();
}
private static final TimeZone utcTz = TimeZone.getTimeZone("UTC");
private static OrcProto.ColumnStatistics createTimestampStats(String min, String max) {
OrcProto.TimestampStatistics.Builder tsStats = OrcProto.TimestampStatistics.newBuilder();
tsStats.setMinimumUtc(getUtcTimestamp(min));
tsStats.setMaximumUtc(getUtcTimestamp(max));
return OrcProto.ColumnStatistics.newBuilder()
.setTimestampStatistics(tsStats.build()).build();
}
private static OrcProto.ColumnStatistics createDecimalStats(String min, String max) {
return createDecimalStats(min, max, true);
}
private static OrcProto.ColumnStatistics createDecimalStats(String min, String max,
boolean hasNull) {
OrcProto.DecimalStatistics.Builder decStats = OrcProto.DecimalStatistics.newBuilder();
decStats.setMinimum(min);
decStats.setMaximum(max);
return OrcProto.ColumnStatistics.newBuilder().setDecimalStatistics(decStats.build())
.setHasNull(hasNull).build();
}
@Test
public void testGetMax() {
assertEquals(100L, RecordReaderImpl.getValueRange(
ColumnStatisticsImpl.deserialize(null, createIntStats(10L, 100L)),
new StubPredicate(PredicateLeaf.Type.LONG), true).upper);
assertEquals(100.0d, RecordReaderImpl.getValueRange(
ColumnStatisticsImpl.deserialize(null,
OrcProto.ColumnStatistics.newBuilder()
.setDoubleStatistics(OrcProto.DoubleStatistics.newBuilder()
.setMinimum(10.0d).setMaximum(100.0d).build()).build()),
new StubPredicate(PredicateLeaf.Type.FLOAT), true).upper);
assertEquals(null, RecordReaderImpl.getValueRange(
ColumnStatisticsImpl.deserialize(null,
OrcProto.ColumnStatistics.newBuilder()
.setStringStatistics(OrcProto.StringStatistics.newBuilder().build())
.build()), new StubPredicate(PredicateLeaf.Type.STRING), true).upper);
assertEquals("b", RecordReaderImpl.getValueRange(
ColumnStatisticsImpl.deserialize(null,
OrcProto.ColumnStatistics.newBuilder()
.setStringStatistics(OrcProto.StringStatistics.newBuilder()
.setMinimum("a").setMaximum("b").build()).build()),
new StubPredicate(PredicateLeaf.Type.STRING), true).upper);
assertEquals("world", RecordReaderImpl.getValueRange(
ColumnStatisticsImpl.deserialize(null,
createStringStats("hello", "world")),
new StubPredicate(PredicateLeaf.Type.STRING), true).upper);
assertEquals(new HiveDecimalWritable("112.1"), RecordReaderImpl.getValueRange(
ColumnStatisticsImpl.deserialize(null,
createDecimalStats("111.1", "112.1")),
new StubPredicate(PredicateLeaf.Type.DECIMAL), true).upper);
}
static TruthValue evaluateBoolean(OrcProto.ColumnStatistics stats,
PredicateLeaf predicate) {
OrcProto.ColumnEncoding encoding =
OrcProto.ColumnEncoding.newBuilder()
.setKind(OrcProto.ColumnEncoding.Kind.DIRECT)
.build();
return RecordReaderImpl.evaluatePredicateProto(stats, predicate, null,
encoding, null,
OrcFile.WriterVersion.ORC_135, TypeDescription.createBoolean());
}
static TruthValue evaluateInteger(OrcProto.ColumnStatistics stats,
PredicateLeaf predicate) {
OrcProto.ColumnEncoding encoding =
OrcProto.ColumnEncoding.newBuilder()
.setKind(OrcProto.ColumnEncoding.Kind.DIRECT_V2)
.build();
return RecordReaderImpl.evaluatePredicateProto(stats, predicate, null,
encoding, null,
OrcFile.WriterVersion.ORC_135, TypeDescription.createLong());
}
static TruthValue evaluateDouble(OrcProto.ColumnStatistics stats,
PredicateLeaf predicate) {
OrcProto.ColumnEncoding encoding =
OrcProto.ColumnEncoding.newBuilder()
.setKind(OrcProto.ColumnEncoding.Kind.DIRECT)
.build();
return RecordReaderImpl.evaluatePredicateProto(stats, predicate, null,
encoding, null,
OrcFile.WriterVersion.ORC_135, TypeDescription.createDouble());
}
static TruthValue evaluateTimestamp(OrcProto.ColumnStatistics stats,
PredicateLeaf predicate,
boolean include135,
boolean useUTCTimestamp) {
OrcProto.ColumnEncoding encoding =
OrcProto.ColumnEncoding.newBuilder()
.setKind(OrcProto.ColumnEncoding.Kind.DIRECT)
.build();
return RecordReaderImpl.evaluatePredicateProto(stats, predicate, null,
encoding, null,
include135 ? OrcFile.WriterVersion.ORC_135: OrcFile.WriterVersion.ORC_101,
TypeDescription.createTimestamp(), useUTCTimestamp);
}
static TruthValue evaluateTimestampBloomfilter(OrcProto.ColumnStatistics stats,
PredicateLeaf predicate,
BloomFilter bloom,
OrcFile.WriterVersion version,
boolean useUTCTimestamp) {
OrcProto.ColumnEncoding.Builder encoding =
OrcProto.ColumnEncoding.newBuilder()
.setKind(OrcProto.ColumnEncoding.Kind.DIRECT);
if (version.includes(OrcFile.WriterVersion.ORC_135)) {
encoding.setBloomEncoding(BloomFilterIO.Encoding.UTF8_UTC.getId());
}
OrcProto.Stream.Kind kind =
version.includes(OrcFile.WriterVersion.ORC_101) ?
OrcProto.Stream.Kind.BLOOM_FILTER_UTF8 :
OrcProto.Stream.Kind.BLOOM_FILTER;
OrcProto.BloomFilter.Builder builder =
OrcProto.BloomFilter.newBuilder();
BloomFilterIO.serialize(builder, bloom);
return RecordReaderImpl.evaluatePredicateProto(stats, predicate, kind,
encoding.build(), builder.build(), version,
TypeDescription.createTimestamp(), useUTCTimestamp);
}
@Test
public void testPredEvalWithBooleanStats() throws Exception {
PredicateLeaf pred = createPredicateLeaf(
PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.BOOLEAN, "x", true, null);
assertEquals(TruthValue.YES_NO,
evaluateBoolean(createBooleanStats(10, 10), pred));
assertEquals(TruthValue.NO,
evaluateBoolean(createBooleanStats(10, 0), pred));
pred = createPredicateLeaf(
PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.BOOLEAN, "x", true, null);
assertEquals(TruthValue.YES_NO,
evaluateBoolean(createBooleanStats(10, 10), pred));
assertEquals(TruthValue.NO,
evaluateBoolean(createBooleanStats(10, 0), pred));
pred = createPredicateLeaf(
PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.BOOLEAN, "x", false, null);
assertEquals(TruthValue.NO,
evaluateBoolean(createBooleanStats(10, 10), pred));
assertEquals(TruthValue.YES_NO,
evaluateBoolean(createBooleanStats(10, 0), pred));
}
@Test
public void testPredEvalWithIntStats() throws Exception {
PredicateLeaf pred = createPredicateLeaf(
PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.LONG, "x", 15L, null);
assertEquals(TruthValue.YES_NO,
evaluateInteger(createIntStats(10, 100), pred));
pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
PredicateLeaf.Type.FLOAT, "x", 15.0, null);
assertEquals(TruthValue.YES_NO,
evaluateInteger(createIntStats(10, 100), pred));
// Stats gets converted to column type. "15" is outside of "10" and "100"
pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
PredicateLeaf.Type.STRING, "x", "15", null);
assertEquals(TruthValue.NO,
evaluateInteger(createIntStats(10, 100), pred));
// Integer stats will not be converted date because of days/seconds/millis ambiguity
pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
PredicateLeaf.Type.DATE, "x", new DateWritable(15).get(), null);
try {
evaluateInteger(createIntStats(10, 100), pred);
fail("evaluate should throw");
} catch (RecordReaderImpl.SargCastException ia) {
assertEquals("ORC SARGS could not convert from Long to DATE", ia.getMessage());
}
pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
PredicateLeaf.Type.DECIMAL, "x", new HiveDecimalWritable("15"), null);
assertEquals(TruthValue.YES_NO,
evaluateInteger(createIntStats(10, 100), pred));
pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
PredicateLeaf.Type.TIMESTAMP, "x", new Timestamp(15), null);
try {
evaluateInteger(createIntStats(10, 100), pred);
fail("evaluate should throw");
} catch (RecordReaderImpl.SargCastException ia) {
assertEquals("ORC SARGS could not convert from Long to TIMESTAMP", ia.getMessage());
}
}
@Test
public void testPredEvalWithDoubleStats() throws Exception {
PredicateLeaf pred = createPredicateLeaf(
PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.LONG, "x", 15L, null);
assertEquals(TruthValue.YES_NO,
evaluateDouble(createDoubleStats(10.0, 100.0), pred));
pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
PredicateLeaf.Type.FLOAT, "x", 15.0, null);
assertEquals(TruthValue.YES_NO,
evaluateDouble(createDoubleStats(10.0, 100.0), pred));
// Stats gets converted to column type. "15.0" is outside of "10.0" and "100.0"
pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
PredicateLeaf.Type.STRING, "x", "15", null);
assertEquals(TruthValue.NO,
evaluateDouble(createDoubleStats(10.0, 100.0), pred));
// Double is not converted to date type because of days/seconds/millis ambiguity
pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
PredicateLeaf.Type.DATE, "x", new DateWritable(15).get(), null);
try {
evaluateDouble(createDoubleStats(10.0, 100.0), pred);
fail("evaluate should throw");
} catch (RecordReaderImpl.SargCastException ia) {
assertEquals("ORC SARGS could not convert from Double to DATE", ia.getMessage());
}
pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
PredicateLeaf.Type.DECIMAL, "x", new HiveDecimalWritable("15"), null);
assertEquals(TruthValue.YES_NO,
evaluateDouble(createDoubleStats(10.0, 100.0), pred));
pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
PredicateLeaf.Type.TIMESTAMP, "x", new Timestamp(15*1000L), null);
assertEquals(TruthValue.YES_NO,
evaluateDouble(createDoubleStats(10.0, 100.0), pred));
pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
PredicateLeaf.Type.TIMESTAMP, "x", new Timestamp(150*1000L), null);
assertEquals(TruthValue.NO,
evaluateDouble(createDoubleStats(10.0, 100.0), pred));
}
@Test
public void testPredEvalWithStringStats() throws Exception {
PredicateLeaf pred = createPredicateLeaf(
PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.LONG, "x", 100L, null);
assertEquals(TruthValue.YES_NO,
evaluateInteger(createStringStats("10", "1000"), pred));
pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
PredicateLeaf.Type.FLOAT, "x", 100.0, null);
assertEquals(TruthValue.YES_NO,
evaluateInteger(createStringStats("10", "1000"), pred));
pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
PredicateLeaf.Type.STRING, "x", "100", null);
assertEquals(TruthValue.YES_NO,
evaluateInteger(createStringStats("10", "1000"), pred));
// IllegalArgumentException is thrown when converting String to Date, hence YES_NO
pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
PredicateLeaf.Type.DATE, "x", new DateWritable(100).get(), null);
assertEquals(TruthValue.YES_NO,
evaluateInteger(createDateStats(10, 1000), pred));
pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
PredicateLeaf.Type.DECIMAL, "x", new HiveDecimalWritable("100"), null);
assertEquals(TruthValue.YES_NO,
evaluateInteger(createStringStats("10", "1000"), pred));
pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
PredicateLeaf.Type.TIMESTAMP, "x", new Timestamp(100), null);
try {
evaluateInteger(createStringStats("10", "1000"), pred);
fail("evaluate should throw");
} catch (RecordReaderImpl.SargCastException ia) {
assertEquals("ORC SARGS could not convert from String to TIMESTAMP", ia.getMessage());
}
}
@Test
public void testPredEvalWithDateStats() throws Exception {
PredicateLeaf pred = createPredicateLeaf(
PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.LONG, "x", 15L, null);
// Date to Integer conversion is not possible.
try {
evaluateInteger(createDateStats(10, 100), pred);
fail("evaluate should throw");
} catch (RecordReaderImpl.SargCastException ia) {
assertEquals("ORC SARGS could not convert from Date to LONG", ia.getMessage());
}
// Date to Float conversion is also not possible.
pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
PredicateLeaf.Type.FLOAT, "x", 15.0, null);
try {
evaluateInteger(createDateStats(10, 100), pred);
fail("evaluate should throw");
} catch (RecordReaderImpl.SargCastException ia) {
assertEquals("ORC SARGS could not convert from Date to FLOAT", ia.getMessage());
}
pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
PredicateLeaf.Type.STRING, "x", "15", null);
assertEquals(TruthValue.NO,
evaluateInteger(createDateStats(10, 100), pred));
pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
PredicateLeaf.Type.STRING, "x", "1970-01-11", null);
assertEquals(TruthValue.YES_NO,
evaluateInteger(createDateStats(10, 100), pred));
pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
PredicateLeaf.Type.STRING, "x", "15.1", null);
assertEquals(TruthValue.NO,
evaluateInteger(createDateStats(10, 100), pred));
pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
PredicateLeaf.Type.STRING, "x", "__a15__1", null);
assertEquals(TruthValue.NO,
evaluateInteger(createDateStats(10, 100), pred));
pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
PredicateLeaf.Type.STRING, "x", "2000-01-16", null);
assertEquals(TruthValue.NO,
evaluateInteger(createDateStats(10, 100), pred));
pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
PredicateLeaf.Type.STRING, "x", "1970-01-16", null);
assertEquals(TruthValue.YES_NO,
evaluateInteger(createDateStats(10, 100), pred));
pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
PredicateLeaf.Type.DATE, "x", new DateWritable(15).get(), null);
assertEquals(TruthValue.YES_NO,
evaluateInteger(createDateStats(10, 100), pred));
pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
PredicateLeaf.Type.DATE, "x", new DateWritable(150).get(), null);
assertEquals(TruthValue.NO,
evaluateInteger(createDateStats(10, 100), pred));
// Date to Decimal conversion is also not possible.
pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
PredicateLeaf.Type.DECIMAL, "x", new HiveDecimalWritable("15"), null);
try {
evaluateInteger(createDateStats(10, 100), pred);
fail("evaluate should throw");
} catch (RecordReaderImpl.SargCastException ia) {
assertEquals("ORC SARGS could not convert from Date to DECIMAL", ia.getMessage());
}
pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
PredicateLeaf.Type.TIMESTAMP, "x", new Timestamp(15), null);
assertEquals(TruthValue.NO,
evaluateInteger(createDateStats(10, 100), pred));
pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
PredicateLeaf.Type.TIMESTAMP, "x", new Timestamp(15L * 24L * 60L * 60L * 1000L), null);
assertEquals(TruthValue.YES_NO,
evaluateInteger(createDateStats(10, 100), pred));
}
@Test
public void testPredEvalWithDecimalStats() throws Exception {
PredicateLeaf pred = createPredicateLeaf(
PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.LONG, "x", 15L, null);
assertEquals(TruthValue.YES_NO,
evaluateInteger(createDecimalStats("10.0", "100.0"), pred));
pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
PredicateLeaf.Type.FLOAT, "x", 15.0, null);
assertEquals(TruthValue.YES_NO,
evaluateInteger(createDecimalStats("10.0", "100.0"), pred));
// "15" out of range of "10.0" and "100.0"
pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
PredicateLeaf.Type.STRING, "x", "15", null);
assertEquals(TruthValue.NO,
evaluateInteger(createDecimalStats("10.0", "100.0"), pred));
// Decimal to Date not possible.
pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
PredicateLeaf.Type.DATE, "x", new DateWritable(15).get(), null);
try {
evaluateInteger(createDecimalStats("10.0", "100.0"), pred);
fail("evaluate should throw");
} catch (RecordReaderImpl.SargCastException ia) {
assertEquals("ORC SARGS could not convert from HiveDecimal to DATE", ia.getMessage());
}
pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
PredicateLeaf.Type.DECIMAL, "x", new HiveDecimalWritable("15"), null);
assertEquals(TruthValue.YES_NO,
evaluateInteger(createDecimalStats("10.0", "100.0"), pred));
pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
PredicateLeaf.Type.TIMESTAMP, "x", new Timestamp(15 * 1000L), null);
assertEquals(TruthValue.YES_NO,
evaluateInteger(createDecimalStats("10.0", "100.0"), pred));
pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
PredicateLeaf.Type.TIMESTAMP, "x", new Timestamp(150 * 1000L), null);
assertEquals(TruthValue.NO,
evaluateInteger(createDecimalStats("10.0", "100.0"), pred));
}
@Test
public void testPredEvalWithTimestampStats() throws Exception {
PredicateLeaf pred = createPredicateLeaf(
PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.TIMESTAMP,
"x", Timestamp.valueOf("2017-01-01 00:00:00"), null);
assertEquals(TruthValue.YES_NO,
evaluateTimestamp(createTimestampStats("2017-01-01 00:00:00",
"2018-01-01 00:00:00"), pred, true, false));
pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
PredicateLeaf.Type.FLOAT, "x", 15.0, null);
assertEquals(TruthValue.YES_NO_NULL,
evaluateTimestamp(createTimestampStats("2017-01-01 00:00:00", "2018-01-01 00:00:00"),
pred, true, false));
assertEquals(TruthValue.YES_NO_NULL,
evaluateTimestamp(createTimestampStats("2017-01-01 00:00:00", "2018-01-01 00:00:00"),
pred, true, false));
// pre orc-135 should always be yes_no_null.
pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
PredicateLeaf.Type.TIMESTAMP, "x", Timestamp.valueOf("2017-01-01 00:00:00"), null);
assertEquals(TruthValue.YES_NO_NULL,
evaluateTimestamp(createTimestampStats("2017-01-01 00:00:00", "2017-01-01 00:00:00"),
pred, false, false));
pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
PredicateLeaf.Type.STRING, "x", Timestamp.valueOf("2017-01-01 00:00:00").toString(), null);
assertEquals(TruthValue.YES_NO,
evaluateTimestamp(createTimestampStats("2017-01-01 00:00:00", "2018-01-01 00:00:00"),
pred, true, false));
pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
PredicateLeaf.Type.DATE, "x", Date.valueOf("2016-01-01"), null);
assertEquals(TruthValue.NO,
evaluateTimestamp(createTimestampStats("2017-01-01 00:00:00", "2017-01-01 00:00:00"),
pred, true, false));
assertEquals(TruthValue.YES_NO,
evaluateTimestamp(createTimestampStats("2015-01-01 00:00:00", "2016-01-01 00:00:00"),
pred, true, false));
pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
PredicateLeaf.Type.DECIMAL, "x", new HiveDecimalWritable("15"), null);
assertEquals(TruthValue.YES_NO_NULL,
evaluateTimestamp(createTimestampStats("2015-01-01 00:00:00", "2016-01-01 00:00:00"),
pred, true, false));
}
@Test
public void testEquals() throws Exception {
PredicateLeaf pred = createPredicateLeaf
(PredicateLeaf.Operator.EQUALS, PredicateLeaf.Type.LONG,
"x", 15L, null);
assertEquals(TruthValue.NO_NULL,
evaluateInteger(createIntStats(20L, 30L), pred));
assertEquals(TruthValue.YES_NO_NULL,
evaluateInteger(createIntStats(15L, 30L), pred)) ;
assertEquals(TruthValue.YES_NO_NULL,
evaluateInteger(createIntStats(10L, 30L), pred));
assertEquals(TruthValue.YES_NO_NULL,
evaluateInteger(createIntStats(10L, 15L), pred));
assertEquals(TruthValue.NO_NULL,
evaluateInteger(createIntStats(0L, 10L), pred));
assertEquals(TruthValue.YES_NULL,
evaluateInteger(createIntStats(15L, 15L), pred));
}
@Test
public void testNullSafeEquals() throws Exception {
PredicateLeaf pred = createPredicateLeaf
(PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.LONG,
"x", 15L, null);
assertEquals(TruthValue.NO,
evaluateInteger(createIntStats(20L, 30L), pred));
assertEquals(TruthValue.YES_NO,
evaluateInteger(createIntStats(15L, 30L), pred));
assertEquals(TruthValue.YES_NO,
evaluateInteger(createIntStats(10L, 30L), pred));
assertEquals(TruthValue.YES_NO,
evaluateInteger(createIntStats(10L, 15L), pred));
assertEquals(TruthValue.NO,
evaluateInteger(createIntStats(0L, 10L), pred));
assertEquals(TruthValue.YES_NO,
evaluateInteger(createIntStats(15L, 15L), pred));
}
@Test
public void testLessThan() throws Exception {
PredicateLeaf lessThan = createPredicateLeaf
(PredicateLeaf.Operator.LESS_THAN, PredicateLeaf.Type.LONG,
"x", 15L, null);
assertEquals(TruthValue.NO_NULL,
evaluateInteger(createIntStats(20L, 30L), lessThan));
assertEquals(TruthValue.NO_NULL,
evaluateInteger(createIntStats(15L, 30L), lessThan));
assertEquals(TruthValue.YES_NO_NULL,
evaluateInteger(createIntStats(10L, 30L), lessThan));
assertEquals(TruthValue.YES_NO_NULL,
evaluateInteger(createIntStats(10L, 15L), lessThan));
assertEquals(TruthValue.YES_NULL,
evaluateInteger(createIntStats(0L, 10L), lessThan));
}
@Test
public void testLessThanEquals() throws Exception {
PredicateLeaf pred = createPredicateLeaf
(PredicateLeaf.Operator.LESS_THAN_EQUALS, PredicateLeaf.Type.LONG,
"x", 15L, null);
assertEquals(TruthValue.NO_NULL,
evaluateInteger(createIntStats(20L, 30L), pred));
assertEquals(TruthValue.YES_NO_NULL,
evaluateInteger(createIntStats(15L, 30L), pred));
assertEquals(TruthValue.YES_NO_NULL,
evaluateInteger(createIntStats(10L, 30L), pred));
assertEquals(TruthValue.YES_NULL,
evaluateInteger(createIntStats(10L, 15L), pred));
assertEquals(TruthValue.YES_NULL,
evaluateInteger(createIntStats(0L, 10L), pred));
}
@Test
public void testIn() throws Exception {
List<Object> args = new ArrayList<Object>();
args.add(10L);
args.add(20L);
PredicateLeaf pred = createPredicateLeaf
(PredicateLeaf.Operator.IN, PredicateLeaf.Type.LONG,
"x", null, args);
assertEquals(TruthValue.YES_NULL,
evaluateInteger(createIntStats(20L, 20L), pred));
assertEquals(TruthValue.NO_NULL,
evaluateInteger(createIntStats(30L, 30L), pred));
assertEquals(TruthValue.YES_NO_NULL,
evaluateInteger(createIntStats(10L, 30L), pred));
assertEquals(TruthValue.NO_NULL,
evaluateInteger(createIntStats(12L, 18L), pred));
}
@Test
public void testBetween() {
List<Object> args = new ArrayList<Object>();
args.add(10L);
args.add(20L);
PredicateLeaf pred = createPredicateLeaf
(PredicateLeaf.Operator.BETWEEN, PredicateLeaf.Type.LONG,
"x", null, args);
assertEquals(TruthValue.NO_NULL,
evaluateInteger(createIntStats(0L, 5L), pred));
assertEquals(TruthValue.NO_NULL,
evaluateInteger(createIntStats(30L, 40L), pred));
assertEquals(TruthValue.YES_NO_NULL,
evaluateInteger(createIntStats(5L, 15L), pred));
assertEquals(TruthValue.YES_NO_NULL,
evaluateInteger(createIntStats(15L, 25L), pred));
assertEquals(TruthValue.YES_NO_NULL,
evaluateInteger(createIntStats(5L, 25L), pred));
assertEquals(TruthValue.YES_NULL,
evaluateInteger(createIntStats(10L, 20L), pred));
assertEquals(TruthValue.YES_NULL,
evaluateInteger(createIntStats(12L, 18L), pred));
// check with empty predicate list
args.clear();
pred = createPredicateLeaf
(PredicateLeaf.Operator.BETWEEN, PredicateLeaf.Type.LONG,
"x", null, args);
assertEquals(TruthValue.YES_NO_NULL,
evaluateInteger(createIntStats(0L, 5L), pred));
assertEquals(TruthValue.YES_NO_NULL,
evaluateInteger(createIntStats(30L, 40L), pred));
assertEquals(TruthValue.YES_NO_NULL,
evaluateInteger(createIntStats(5L, 15L), pred));
assertEquals(TruthValue.YES_NO_NULL,
evaluateInteger(createIntStats(10L, 20L), pred));
}
@Test
public void testIsNull() {
PredicateLeaf pred = createPredicateLeaf
(PredicateLeaf.Operator.IS_NULL, PredicateLeaf.Type.LONG,
"x", null, null);
assertEquals(TruthValue.YES_NO,
evaluateInteger(createIntStats(20L, 30L), pred));
}
@Test
public void testEqualsWithNullInStats() {
PredicateLeaf pred = createPredicateLeaf
(PredicateLeaf.Operator.EQUALS, PredicateLeaf.Type.STRING,
"x", "c", null);
assertEquals(TruthValue.NO_NULL,
evaluateInteger(createStringStats("d", "e", true), pred)); // before
assertEquals(TruthValue.NO_NULL,
evaluateInteger(createStringStats("a", "b", true), pred)); // after
assertEquals(TruthValue.YES_NO_NULL,
evaluateInteger(createStringStats("b", "c", true), pred)); // max
assertEquals(TruthValue.YES_NO_NULL,
evaluateInteger(createStringStats("c", "d", true), pred)); // min
assertEquals(TruthValue.YES_NO_NULL,
evaluateInteger(createStringStats("b", "d", true), pred)); // middle
assertEquals(TruthValue.YES_NULL,
evaluateInteger(createStringStats("c", "c", true), pred)); // same
}
@Test
public void testNullSafeEqualsWithNullInStats() {
PredicateLeaf pred = createPredicateLeaf
(PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.STRING,
"x", "c", null);
assertEquals(TruthValue.NO,
evaluateInteger(createStringStats("d", "e", true), pred)); // before
assertEquals(TruthValue.NO,
evaluateInteger(createStringStats("a", "b", true), pred)); // after
assertEquals(TruthValue.YES_NO,
evaluateInteger(createStringStats("b", "c", true), pred)); // max
assertEquals(TruthValue.YES_NO,
evaluateInteger(createStringStats("c", "d", true), pred)); // min
assertEquals(TruthValue.YES_NO,
evaluateInteger(createStringStats("b", "d", true), pred)); // middle
assertEquals(TruthValue.YES_NO,
evaluateInteger(createStringStats("c", "c", true), pred)); // same
}
@Test
public void testLessThanWithNullInStats() {
PredicateLeaf pred = createPredicateLeaf
(PredicateLeaf.Operator.LESS_THAN, PredicateLeaf.Type.STRING,
"x", "c", null);
assertEquals(TruthValue.NO_NULL,
evaluateInteger(createStringStats("d", "e", true), pred)); // before
assertEquals(TruthValue.YES_NULL,
evaluateInteger(createStringStats("a", "b", true), pred)); // after
assertEquals(TruthValue.YES_NO_NULL,
evaluateInteger(createStringStats("b", "c", true), pred)); // max
assertEquals(TruthValue.NO_NULL,
evaluateInteger(createStringStats("c", "d", true), pred)); // min
assertEquals(TruthValue.YES_NO_NULL,
evaluateInteger(createStringStats("b", "d", true), pred)); // middle
assertEquals(TruthValue.NO_NULL, // min, same stats
evaluateInteger(createStringStats("c", "c", true), pred));
}
@Test
public void testLessThanEqualsWithNullInStats() throws Exception {
PredicateLeaf pred = createPredicateLeaf
(PredicateLeaf.Operator.LESS_THAN_EQUALS, PredicateLeaf.Type.STRING,
"x", "c", null);
assertEquals(TruthValue.NO_NULL,
evaluateInteger(createStringStats("d", "e", true), pred)); // before
assertEquals(TruthValue.YES_NULL,
evaluateInteger(createStringStats("a", "b", true), pred)); // after
assertEquals(TruthValue.YES_NULL,
evaluateInteger(createStringStats("b", "c", true), pred)); // max
assertEquals(TruthValue.YES_NO_NULL,
evaluateInteger(createStringStats("c", "d", true), pred)); // min
assertEquals(TruthValue.YES_NO_NULL,
evaluateInteger(createStringStats("b", "d", true), pred)); // middle
assertEquals(TruthValue.YES_NO_NULL,
evaluateInteger(createStringStats("c", "c", true), pred)); // same
}
@Test
public void testInWithNullInStats() throws Exception {
List<Object> args = new ArrayList<Object>();
args.add("c");
args.add("f");
PredicateLeaf pred = createPredicateLeaf
(PredicateLeaf.Operator.IN, PredicateLeaf.Type.STRING,
"x", null, args);
assertEquals(TruthValue.NO_NULL, // before & after
evaluateInteger(createStringStats("d", "e", true), pred));
assertEquals(TruthValue.NO_NULL,
evaluateInteger(createStringStats("a", "b", true), pred)); // after
assertEquals(TruthValue.YES_NO_NULL,
evaluateInteger(createStringStats("e", "f", true), pred)); // max
assertEquals(TruthValue.YES_NO_NULL,
evaluateInteger(createStringStats("c", "d", true), pred)); // min
assertEquals(TruthValue.YES_NO_NULL,
evaluateInteger(createStringStats("b", "d", true), pred)); // middle
assertEquals(TruthValue.YES_NULL,
evaluateInteger(createStringStats("c", "c", true), pred)); // same
}
@Test
public void testBetweenWithNullInStats() throws Exception {
List<Object> args = new ArrayList<Object>();
args.add("c");
args.add("f");
PredicateLeaf pred = createPredicateLeaf
(PredicateLeaf.Operator.BETWEEN, PredicateLeaf.Type.STRING,
"x", null, args);
assertEquals(TruthValue.YES_NULL, // before & after
evaluateInteger(createStringStats("d", "e", true), pred));
assertEquals(TruthValue.YES_NULL, // before & max
evaluateInteger(createStringStats("e", "f", true), pred));
assertEquals(TruthValue.NO_NULL, // before & before
evaluateInteger(createStringStats("h", "g", true), pred));
assertEquals(TruthValue.YES_NO_NULL, // before & min
evaluateInteger(createStringStats("f", "g", true), pred));
assertEquals(TruthValue.YES_NO_NULL, // before & middle
evaluateInteger(createStringStats("e", "g", true), pred));
assertEquals(TruthValue.YES_NULL, // min & after
evaluateInteger(createStringStats("c", "e", true), pred));
assertEquals(TruthValue.YES_NULL, // min & max
evaluateInteger(createStringStats("c", "f", true), pred));
assertEquals(TruthValue.YES_NO_NULL, // min & middle
evaluateInteger(createStringStats("c", "g", true), pred));
assertEquals(TruthValue.NO_NULL,
evaluateInteger(createStringStats("a", "b", true), pred)); // after
assertEquals(TruthValue.YES_NO_NULL,
evaluateInteger(createStringStats("a", "c", true), pred)); // max
assertEquals(TruthValue.YES_NO_NULL,
evaluateInteger(createStringStats("b", "d", true), pred)); // middle
assertEquals(TruthValue.YES_NULL, // min & after, same stats
evaluateInteger(createStringStats("c", "c", true), pred));
}
@Test
public void testTimestampStatsOldFiles() throws Exception {
PredicateLeaf pred = createPredicateLeaf
(PredicateLeaf.Operator.EQUALS, PredicateLeaf.Type.TIMESTAMP,
"x", Timestamp.valueOf("2000-01-01 00:00:00"), null);
OrcProto.ColumnStatistics cs = createTimestampStats("2000-01-01 00:00:00", "2001-01-01 00:00:00");
assertEquals(TruthValue.YES_NO_NULL,
evaluateTimestampBloomfilter(cs, pred, new BloomFilterUtf8(10000, 0.01), OrcFile.WriterVersion.ORC_101, false));
BloomFilterUtf8 bf = new BloomFilterUtf8(10, 0.05);
bf.addLong(getUtcTimestamp("2000-06-01 00:00:00"));
assertEquals(TruthValue.NO_NULL,
evaluateTimestampBloomfilter(cs, pred, bf, OrcFile.WriterVersion.ORC_135, false));
assertEquals(TruthValue.YES_NO_NULL,
evaluateTimestampBloomfilter(cs, pred, bf, OrcFile.WriterVersion.ORC_101, false));
}
@Test
public void testTimestampUTC() throws Exception {
DateFormat f = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
f.setTimeZone(TimeZone.getTimeZone("UTC"));
PredicateLeaf pred = createPredicateLeaf
(PredicateLeaf.Operator.EQUALS, PredicateLeaf.Type.TIMESTAMP,
"x", new Timestamp(f.parse("2015-01-01 00:00:00").getTime()), null);
PredicateLeaf pred2 = createPredicateLeaf
(PredicateLeaf.Operator.EQUALS, PredicateLeaf.Type.TIMESTAMP,
"x", new Timestamp(f.parse("2014-12-31 23:59:59").getTime()), null);
PredicateLeaf pred3 = createPredicateLeaf
(PredicateLeaf.Operator.EQUALS, PredicateLeaf.Type.TIMESTAMP,
"x", new Timestamp(f.parse("2016-01-01 00:00:01").getTime()), null);
OrcProto.ColumnStatistics cs = createTimestampStats("2015-01-01 00:00:00", "2016-01-01 00:00:00");
assertEquals(TruthValue.YES_NO_NULL,
evaluateTimestamp(cs, pred, true, true));
assertEquals(TruthValue.NO_NULL,
evaluateTimestamp(cs, pred2, true, true));
assertEquals(TruthValue.NO_NULL,
evaluateTimestamp(cs, pred3, true, true));
assertEquals(TruthValue.NO_NULL,
evaluateTimestampBloomfilter(cs, pred, new BloomFilterUtf8(10000, 0.01), OrcFile.WriterVersion.ORC_135, true));
assertEquals(TruthValue.NO_NULL,
evaluateTimestampBloomfilter(cs, pred2, new BloomFilterUtf8(10000, 0.01), OrcFile.WriterVersion.ORC_135, true));
BloomFilterUtf8 bf = new BloomFilterUtf8(10, 0.05);
bf.addLong(getUtcTimestamp("2015-06-01 00:00:00"));
assertEquals(TruthValue.NO_NULL,
evaluateTimestampBloomfilter(cs, pred, bf, OrcFile.WriterVersion.ORC_135, true));
bf.addLong(getUtcTimestamp("2015-01-01 00:00:00"));
assertEquals(TruthValue.YES_NO_NULL,
evaluateTimestampBloomfilter(cs, pred, bf, OrcFile.WriterVersion.ORC_135, true));
}
private static long getUtcTimestamp(String ts) {
DateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
dateFormat.setTimeZone(utcTz);
try {
return dateFormat.parse(ts).getTime();
} catch (ParseException e) {
throw new IllegalArgumentException("Can't parse " + ts, e);
}
}
@Test
public void testIsNullWithNullInStats() throws Exception {
PredicateLeaf pred = createPredicateLeaf
(PredicateLeaf.Operator.IS_NULL, PredicateLeaf.Type.STRING,
"x", null, null);
assertEquals(TruthValue.YES_NO,
evaluateInteger(createStringStats("c", "d", true), pred));
assertEquals(TruthValue.NO,
evaluateInteger(createStringStats("c", "d", false), pred));
}
@Test
public void testOverlap() throws Exception {
assertTrue(!RecordReaderUtils.overlap(0, 10, -10, -1));
assertTrue(RecordReaderUtils.overlap(0, 10, -1, 0));
assertTrue(RecordReaderUtils.overlap(0, 10, -1, 1));
assertTrue(RecordReaderUtils.overlap(0, 10, 2, 8));
assertTrue(RecordReaderUtils.overlap(0, 10, 5, 10));
assertTrue(RecordReaderUtils.overlap(0, 10, 10, 11));
assertTrue(RecordReaderUtils.overlap(0, 10, 0, 10));
assertTrue(RecordReaderUtils.overlap(0, 10, -1, 11));
assertTrue(!RecordReaderUtils.overlap(0, 10, 11, 12));
}
private static DiskRangeList diskRanges(Integer... points) {
DiskRangeList head = null, tail = null;
for(int i = 0; i < points.length; i += 2) {
DiskRangeList range = new DiskRangeList(points[i], points[i+1]);
if (tail == null) {
head = tail = range;
} else {
tail = tail.insertAfter(range);
}
}
return head;
}
@Test
public void testGetIndexPosition() throws Exception {
boolean uncompressed = false;
boolean compressed = true;
assertEquals(0, RecordReaderUtils.getIndexPosition
(OrcProto.ColumnEncoding.Kind.DIRECT, TypeDescription.Category.INT,
OrcProto.Stream.Kind.PRESENT, compressed, true));
assertEquals(4, RecordReaderUtils.getIndexPosition
(OrcProto.ColumnEncoding.Kind.DIRECT, TypeDescription.Category.INT,
OrcProto.Stream.Kind.DATA, compressed, true));
assertEquals(3, RecordReaderUtils.getIndexPosition
(OrcProto.ColumnEncoding.Kind.DIRECT, TypeDescription.Category.INT,
OrcProto.Stream.Kind.DATA, uncompressed, true));
assertEquals(0, RecordReaderUtils.getIndexPosition
(OrcProto.ColumnEncoding.Kind.DIRECT, TypeDescription.Category.INT,
OrcProto.Stream.Kind.DATA, compressed, false));
assertEquals(4, RecordReaderUtils.getIndexPosition
(OrcProto.ColumnEncoding.Kind.DICTIONARY, TypeDescription.Category.STRING,
OrcProto.Stream.Kind.DATA, compressed, true));
assertEquals(4, RecordReaderUtils.getIndexPosition
(OrcProto.ColumnEncoding.Kind.DIRECT, TypeDescription.Category.BINARY,
OrcProto.Stream.Kind.DATA, compressed, true));
assertEquals(3, RecordReaderUtils.getIndexPosition
(OrcProto.ColumnEncoding.Kind.DIRECT, TypeDescription.Category.BINARY,
OrcProto.Stream.Kind.DATA, uncompressed, true));
assertEquals(6, RecordReaderUtils.getIndexPosition
(OrcProto.ColumnEncoding.Kind.DIRECT, TypeDescription.Category.BINARY,
OrcProto.Stream.Kind.LENGTH, compressed, true));
assertEquals(4, RecordReaderUtils.getIndexPosition
(OrcProto.ColumnEncoding.Kind.DIRECT, TypeDescription.Category.BINARY,
OrcProto.Stream.Kind.LENGTH, uncompressed, true));
assertEquals(4, RecordReaderUtils.getIndexPosition
(OrcProto.ColumnEncoding.Kind.DIRECT, TypeDescription.Category.DECIMAL,
OrcProto.Stream.Kind.DATA, compressed, true));
assertEquals(3, RecordReaderUtils.getIndexPosition
(OrcProto.ColumnEncoding.Kind.DIRECT, TypeDescription.Category.DECIMAL,
OrcProto.Stream.Kind.DATA, uncompressed, true));
assertEquals(6, RecordReaderUtils.getIndexPosition
(OrcProto.ColumnEncoding.Kind.DIRECT, TypeDescription.Category.DECIMAL,
OrcProto.Stream.Kind.SECONDARY, compressed, true));
assertEquals(4, RecordReaderUtils.getIndexPosition
(OrcProto.ColumnEncoding.Kind.DIRECT, TypeDescription.Category.DECIMAL,
OrcProto.Stream.Kind.SECONDARY, uncompressed, true));
assertEquals(4, RecordReaderUtils.getIndexPosition
(OrcProto.ColumnEncoding.Kind.DIRECT, TypeDescription.Category.TIMESTAMP,
OrcProto.Stream.Kind.DATA, compressed, true));
assertEquals(3, RecordReaderUtils.getIndexPosition
(OrcProto.ColumnEncoding.Kind.DIRECT, TypeDescription.Category.TIMESTAMP,
OrcProto.Stream.Kind.DATA, uncompressed, true));
assertEquals(7, RecordReaderUtils.getIndexPosition
(OrcProto.ColumnEncoding.Kind.DIRECT, TypeDescription.Category.TIMESTAMP,
OrcProto.Stream.Kind.SECONDARY, compressed, true));
assertEquals(5, RecordReaderUtils.getIndexPosition
(OrcProto.ColumnEncoding.Kind.DIRECT, TypeDescription.Category.TIMESTAMP,
OrcProto.Stream.Kind.SECONDARY, uncompressed, true));
}
@Test
public void testPartialPlan() throws Exception {
TypeDescription schema = TypeDescription.fromString("struct<x:int,y:int>");
MockDataReader dataReader = new MockDataReader(schema)
.addStream(1, OrcProto.Stream.Kind.ROW_INDEX,
createRowIndex(entry(0, -1, -1, 0),
entry(100, -1, -1, 10000),
entry(200, -1, -1, 20000),
entry(300, -1, -1, 30000),
entry(400, -1, -1, 40000),
entry(500, -1, -1, 50000)))
.addStream(2, OrcProto.Stream.Kind.ROW_INDEX,
createRowIndex(entry(0, -1, -1, 0),
entry(200, -1, -1, 20000),
entry(400, -1, -1, 40000),
entry(600, -1, -1, 60000),
entry(800, -1, -1, 80000),
entry(1000, -1, -1, 100000)))
.addStream(1, OrcProto.Stream.Kind.PRESENT, createDataStream(1, 1000))
.addStream(1, OrcProto.Stream.Kind.DATA, createDataStream(2, 99000))
.addStream(2, OrcProto.Stream.Kind.PRESENT, createDataStream(3, 2000))
.addStream(2, OrcProto.Stream.Kind.DATA, createDataStream(4, 198000))
.addEncoding(OrcProto.ColumnEncoding.Kind.DIRECT)
.addEncoding(OrcProto.ColumnEncoding.Kind.DIRECT)
.addEncoding(OrcProto.ColumnEncoding.Kind.DIRECT)
.addStripeFooter(1000, null);
MockStripe stripe = dataReader.getStripe(0);
// get the start of the data streams
final long START = stripe.getStream(1, OrcProto.Stream.Kind.PRESENT).offset;
boolean[] columns = new boolean[]{true, true, false};
boolean[] rowGroups = new boolean[]{true, true, false, false, true, false};
// filter by rows and groups
StripePlanner planner = new StripePlanner(schema, new ReaderEncryption(),
dataReader, OrcFile.WriterVersion.ORC_14, false, Integer.MAX_VALUE);
planner.parseStripe(stripe, columns);
OrcIndex index = planner.readRowIndex(null, null);
BufferChunkList result = planner.readData(index, rowGroups, false);
assertEquals(START, result.get(0).getOffset());
assertEquals(1000, result.get(0).getLength());
assertEquals(START + 1000, result.get(1).getOffset());
assertEquals(20000 + RecordReaderUtils.WORST_UNCOMPRESSED_SLOP, result.get(1).getLength());
assertEquals(START + 41000, result.get(2).getOffset());
assertEquals(10000 + RecordReaderUtils.WORST_UNCOMPRESSED_SLOP, result.get(2).getLength());
assertEquals(null, result.get(3));
// if we read no rows, don't read any bytes
rowGroups = new boolean[]{false, false, false, false, false, false};
result = planner.readData(index, rowGroups, false);
assertEquals(null, result.get(0));
// all rows, but only columns 0 and 2.
rowGroups = null;
columns = new boolean[]{true, false, true};
planner.parseStripe(stripe, columns).readRowIndex(null, index);
result = planner.readData(index, rowGroups, false);
assertEquals(START + 100000, result.get(0).getOffset());
assertEquals(2000, result.get(0).getLength());
assertEquals(START + 102000, result.get(1).getOffset());
assertEquals(198000, result.get(1).getLength());
assertEquals(null, result.get(2));
rowGroups = new boolean[]{false, true, false, false, false, false};
result = planner.readData(index, rowGroups, false);
assertEquals(START + 100200, result.get(0).getOffset());
assertEquals(1800, result.get(0).getLength());
assertEquals(START + 122000, result.get(1).getOffset());
assertEquals(20000 + RecordReaderUtils.WORST_UNCOMPRESSED_SLOP,
result.get(1).getLength());
assertEquals(null, result.get(2));
rowGroups = new boolean[]{false, false, false, false, false, true};
columns = new boolean[]{true, true, true};
planner.parseStripe(stripe, columns).readRowIndex(null, index);
result = planner.readData(index, rowGroups, false);
assertEquals(START + 500, result.get(0).getOffset());
assertEquals(500, result.get(0).getLength());
assertEquals(START + 51000, result.get(1).getOffset());
assertEquals(49000, result.get(1).getLength());
assertEquals(START + 101000, result.get(2).getOffset());
assertEquals(1000, result.get(2).getLength());
assertEquals(START + 202000, result.get(3).getOffset());
assertEquals(98000, result.get(3).getLength());
assertEquals(null, result.get(4));
}
@Test
public void testPartialPlanCompressed() throws Exception {
TypeDescription schema = TypeDescription.fromString("struct<x:int,y:int>");
InStream.StreamOptions options =
new InStream.StreamOptions()
.withCodec(OrcCodecPool.getCodec(CompressionKind.ZLIB))
.withBufferSize(1024);
final int SLOP = 2 * (OutStream.HEADER_SIZE + options.getBufferSize());
MockDataReader dataReader = new MockDataReader(schema, options)
.addStream(1, OrcProto.Stream.Kind.ROW_INDEX,
createRowIndex(options,
entry(0, -1, -1, -1, 0),
entry(100, -1, -1, -1, 10000),
entry(200, -1, -1, -1, 20000),
entry(300, -1, -1, -1, 30000),
entry(400, -1, -1, -1, 40000),
entry(500, -1, -1, -1, 50000)))
.addStream(2, OrcProto.Stream.Kind.ROW_INDEX,
createRowIndex(options,
entry(0, -1, -1, -1, 0),
entry(200, -1, -1, -1, 20000),
entry(400, -1, -1, -1, 40000),
entry(600, -1, -1, -1, 60000),
entry(800, -1, -1, -1, 80000),
entry(1000, -1, -1, -1, 100000)))
.addStream(1, OrcProto.Stream.Kind.PRESENT, createDataStream(1, 1000))
.addStream(1, OrcProto.Stream.Kind.DATA, createDataStream(2, 99000))
.addStream(2, OrcProto.Stream.Kind.PRESENT, createDataStream(3, 2000))
.addStream(2, OrcProto.Stream.Kind.DATA, createDataStream(4, 198000))
.addEncoding(OrcProto.ColumnEncoding.Kind.DIRECT)
.addEncoding(OrcProto.ColumnEncoding.Kind.DIRECT)
.addEncoding(OrcProto.ColumnEncoding.Kind.DIRECT)
.addStripeFooter(1000, null);
MockStripe stripe = dataReader.getStripe(0);
// get the start of the data streams
final long START = stripe.getStream(1, OrcProto.Stream.Kind.PRESENT).offset;
StripePlanner planner = new StripePlanner(schema, new ReaderEncryption(),
dataReader, OrcFile.WriterVersion.ORC_14, false, Integer.MAX_VALUE);
// filter by rows and groups
boolean[] columns = new boolean[]{true, true, false};
boolean[] rowGroups = new boolean[]{true, true, false, false, true, false};
planner.parseStripe(stripe, columns);
OrcIndex index = planner.readRowIndex(null, null);
BufferChunkList result = planner.readData(index, rowGroups, false);
assertEquals(START, result.get(0).getOffset());
assertEquals(1000, result.get(0).getLength());
assertEquals(START + 1000, result.get(1).getOffset());
assertEquals(20000 + SLOP, result.get(1).getLength());
assertEquals(START + 41000, result.get(2).getOffset());
assertEquals(10000 + SLOP, result.get(2).getLength());
assertEquals(null, result.get(3));
rowGroups = new boolean[]{false, false, false, false, false, true};
result = planner.readData(index, rowGroups, false);
assertEquals(START + 500, result.get(0).getOffset());
assertEquals(500, result.get(0).getLength());
assertEquals(START + 51000, result.get(1).getOffset());
assertEquals(49000, result.get(1).getLength());
assertEquals(null, result.get(2));
}
@Test
public void testPartialPlanString() throws Exception {
TypeDescription schema = TypeDescription.fromString("struct<x:string,y:int>");
MockDataReader dataReader =
new MockDataReader(schema)
.addStream(1, OrcProto.Stream.Kind.ROW_INDEX,
createRowIndex(entry(0, -1, -1, 0),
entry(100, -1, -1, 10000),
entry(200, -1, -1, 20000),
entry(300, -1, -1, 30000),
entry(400, -1, -1, 40000),
entry(500, -1, -1, 50000)))
.addStream(2, OrcProto.Stream.Kind.ROW_INDEX,
createRowIndex(entry(0, -1, -1, 0),
entry(200, -1, -1, 20000),
entry(400, -1, -1, 40000),
entry(600, -1, -1, 60000),
entry(800, -1, -1, 80000),
entry(1000, -1, -1, 100000)))
.addStream(1, OrcProto.Stream.Kind.PRESENT, createDataStream(1, 1000))
.addStream(1, OrcProto.Stream.Kind.DATA, createDataStream(2, 94000))
.addStream(1, OrcProto.Stream.Kind.LENGTH, createDataStream(3, 2000))
.addStream(1, OrcProto.Stream.Kind.DICTIONARY_DATA, createDataStream(4, 3000))
.addStream(2, OrcProto.Stream.Kind.PRESENT, createDataStream(5, 2000))
.addStream(2, OrcProto.Stream.Kind.DATA, createDataStream(6, 198000))
.addEncoding(OrcProto.ColumnEncoding.Kind.DIRECT)
.addEncoding(OrcProto.ColumnEncoding.Kind.DICTIONARY)
.addEncoding(OrcProto.ColumnEncoding.Kind.DIRECT)
.addStripeFooter(1000, null);
MockStripe stripe = dataReader.getStripe(0);
// get the start of the data streams
final long START = stripe.getStream(1, OrcProto.Stream.Kind.PRESENT).offset;
// filter by rows and groups
StripePlanner planner = new StripePlanner(schema, new ReaderEncryption(),
dataReader, OrcFile.WriterVersion.ORC_14, false, Integer.MAX_VALUE);
// filter by rows and groups
boolean[] columns = new boolean[]{true, true, false};
boolean[] rowGroups = new boolean[]{false, true, false, false, true, true};
planner.parseStripe(stripe, columns);
OrcIndex index = planner.readRowIndex(null, null);
BufferChunkList result = planner.readData(index, rowGroups, false);
assertEquals(START + 100, result.get(0).getOffset());
assertEquals(900, result.get(0).getLength());
assertEquals(START + 11000, result.get(1).getOffset());
assertEquals(10000 + RecordReaderUtils.WORST_UNCOMPRESSED_SLOP,
result.get(1).getLength());
assertEquals(START + 41000, result.get(2).getOffset());
assertEquals(54000, result.get(2).getLength());
assertEquals(START + 95000, result.get(3).getOffset());
assertEquals(2000, result.get(3).getLength());
assertEquals(START + 97000, result.get(4).getOffset());
assertEquals(3000, result.get(4).getLength());
assertEquals(null, result.get(5));
// Don't read anything if no groups are selected
rowGroups = new boolean[6];
result = planner.readData(index, rowGroups, false);
assertEquals(null, result.get(0));
}
@Test
public void testIntNullSafeEqualsBloomFilter() throws Exception {
PredicateLeaf pred = createPredicateLeaf(
PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.LONG, "x", 15L, null);
BloomFilter bf = new BloomFilter(10000);
for (int i = 20; i < 1000; i++) {
bf.addLong(i);
}
ColumnStatistics cs = ColumnStatisticsImpl.deserialize(null, createIntStats(10, 100));
assertEquals(TruthValue.NO, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
bf.addLong(15);
assertEquals(TruthValue.YES_NO, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
}
@Test
public void testIntEqualsBloomFilter() throws Exception {
PredicateLeaf pred = createPredicateLeaf(
PredicateLeaf.Operator.EQUALS, PredicateLeaf.Type.LONG, "x", 15L, null);
BloomFilter bf = new BloomFilter(10000);
for (int i = 20; i < 1000; i++) {
bf.addLong(i);
}
ColumnStatistics cs = ColumnStatisticsImpl.deserialize(null, createIntStats(10, 100));
assertEquals(TruthValue.NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
bf.addLong(15);
assertEquals(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
}
@Test
public void testIntInBloomFilter() throws Exception {
List<Object> args = new ArrayList<Object>();
args.add(15L);
args.add(19L);
PredicateLeaf pred = createPredicateLeaf
(PredicateLeaf.Operator.IN, PredicateLeaf.Type.LONG,
"x", null, args);
BloomFilter bf = new BloomFilter(10000);
for (int i = 20; i < 1000; i++) {
bf.addLong(i);
}
ColumnStatistics cs = ColumnStatisticsImpl.deserialize(null, createIntStats(10, 100));
assertEquals(TruthValue.NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
bf.addLong(19);
assertEquals(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
bf.addLong(15);
assertEquals(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
}
@Test
public void testDoubleNullSafeEqualsBloomFilter() throws Exception {
PredicateLeaf pred = createPredicateLeaf(
PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.FLOAT, "x", 15.0, null);
BloomFilter bf = new BloomFilter(10000);
for (int i = 20; i < 1000; i++) {
bf.addDouble(i);
}
ColumnStatistics cs = ColumnStatisticsImpl.deserialize(null, createDoubleStats(10.0, 100.0));
assertEquals(TruthValue.NO, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
bf.addDouble(15.0);
assertEquals(TruthValue.YES_NO, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
}
@Test
public void testDoubleEqualsBloomFilter() throws Exception {
PredicateLeaf pred = createPredicateLeaf(
PredicateLeaf.Operator.EQUALS, PredicateLeaf.Type.FLOAT, "x", 15.0, null);
BloomFilter bf = new BloomFilter(10000);
for (int i = 20; i < 1000; i++) {
bf.addDouble(i);
}
ColumnStatistics cs = ColumnStatisticsImpl.deserialize(null, createDoubleStats(10.0, 100.0));
assertEquals(TruthValue.NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
bf.addDouble(15.0);
assertEquals(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
}
@Test
public void testDoubleInBloomFilter() throws Exception {
List<Object> args = new ArrayList<Object>();
args.add(15.0);
args.add(19.0);
PredicateLeaf pred = createPredicateLeaf
(PredicateLeaf.Operator.IN, PredicateLeaf.Type.FLOAT,
"x", null, args);
BloomFilter bf = new BloomFilter(10000);
for (int i = 20; i < 1000; i++) {
bf.addDouble(i);
}
ColumnStatistics cs = ColumnStatisticsImpl.deserialize(null, createDoubleStats(10.0, 100.0));
assertEquals(TruthValue.NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
bf.addDouble(19.0);
assertEquals(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
bf.addDouble(15.0);
assertEquals(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
}
@Test
public void testStringNullSafeEqualsBloomFilter() throws Exception {
PredicateLeaf pred = createPredicateLeaf(
PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.STRING, "x", "str_15", null);
BloomFilter bf = new BloomFilter(10000);
for (int i = 20; i < 1000; i++) {
bf.addString("str_" + i);
}
ColumnStatistics cs = ColumnStatisticsImpl.deserialize(null, createStringStats("str_10", "str_200"));
assertEquals(TruthValue.NO, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
bf.addString("str_15");
assertEquals(TruthValue.YES_NO, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
}
@Test
public void testStringEqualsBloomFilter() throws Exception {
PredicateLeaf pred = createPredicateLeaf(
PredicateLeaf.Operator.EQUALS, PredicateLeaf.Type.STRING, "x", "str_15", null);
BloomFilter bf = new BloomFilter(10000);
for (int i = 20; i < 1000; i++) {
bf.addString("str_" + i);
}
ColumnStatistics cs = ColumnStatisticsImpl.deserialize(null, createStringStats("str_10", "str_200"));
assertEquals(TruthValue.NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
bf.addString("str_15");
assertEquals(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
}
@Test
public void testStringInBloomFilter() throws Exception {
List<Object> args = new ArrayList<Object>();
args.add("str_15");
args.add("str_19");
PredicateLeaf pred = createPredicateLeaf
(PredicateLeaf.Operator.IN, PredicateLeaf.Type.STRING,
"x", null, args);
BloomFilter bf = new BloomFilter(10000);
for (int i = 20; i < 1000; i++) {
bf.addString("str_" + i);
}
ColumnStatistics cs = ColumnStatisticsImpl.deserialize(null, createStringStats("str_10", "str_200"));
assertEquals(TruthValue.NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
bf.addString("str_19");
assertEquals(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
bf.addString("str_15");
assertEquals(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
}
@Test
public void testDateWritableNullSafeEqualsBloomFilter() throws Exception {
PredicateLeaf pred = createPredicateLeaf(
PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.DATE, "x",
new DateWritable(15).get(), null);
BloomFilter bf = new BloomFilter(10000);
for (int i = 20; i < 1000; i++) {
bf.addLong((new DateWritable(i)).getDays());
}
ColumnStatistics cs = ColumnStatisticsImpl.deserialize(null, createDateStats(10, 100));
assertEquals(TruthValue.NO, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
bf.addLong((new DateWritable(15)).getDays());
assertEquals(TruthValue.YES_NO, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
}
@Test
public void testDateWritableEqualsBloomFilter() throws Exception {
PredicateLeaf pred = createPredicateLeaf(
PredicateLeaf.Operator.EQUALS, PredicateLeaf.Type.DATE, "x",
new DateWritable(15).get(), null);
BloomFilter bf = new BloomFilter(10000);
for (int i = 20; i < 1000; i++) {
bf.addLong((new DateWritable(i)).getDays());
}
ColumnStatistics cs = ColumnStatisticsImpl.deserialize(null, createDateStats(10, 100));
assertEquals(TruthValue.NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
bf.addLong((new DateWritable(15)).getDays());
assertEquals(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
}
@Test
public void testDateWritableInBloomFilter() throws Exception {
List<Object> args = new ArrayList<Object>();
args.add(new DateWritable(15).get());
args.add(new DateWritable(19).get());
PredicateLeaf pred = createPredicateLeaf
(PredicateLeaf.Operator.IN, PredicateLeaf.Type.DATE,
"x", null, args);
BloomFilter bf = new BloomFilter(10000);
for (int i = 20; i < 1000; i++) {
bf.addLong((new DateWritable(i)).getDays());
}
ColumnStatistics cs = ColumnStatisticsImpl.deserialize(null, createDateStats(10, 100));
assertEquals(TruthValue.NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
bf.addLong((new DateWritable(19)).getDays());
assertEquals(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
bf.addLong((new DateWritable(15)).getDays());
assertEquals(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
}
@Test
public void testDecimalEqualsBloomFilter() throws Exception {
PredicateLeaf pred = createPredicateLeaf(
PredicateLeaf.Operator.EQUALS, PredicateLeaf.Type.DECIMAL, "x",
new HiveDecimalWritable("15"),
null);
BloomFilter bf = new BloomFilter(10000);
for (int i = 20; i < 1000; i++) {
bf.addString(HiveDecimal.create(i).toString());
}
ColumnStatistics cs = ColumnStatisticsImpl.deserialize(null, createDecimalStats("10", "200"));
assertEquals(TruthValue.NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
bf.addString(HiveDecimal.create(15).toString());
assertEquals(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
}
@Test
public void testDecimalInBloomFilter() throws Exception {
List<Object> args = new ArrayList<Object>();
args.add(new HiveDecimalWritable("15"));
args.add(new HiveDecimalWritable("19"));
PredicateLeaf pred = createPredicateLeaf
(PredicateLeaf.Operator.IN, PredicateLeaf.Type.DECIMAL,
"x", null, args);
BloomFilter bf = new BloomFilter(10000);
for (int i = 20; i < 1000; i++) {
bf.addString(HiveDecimal.create(i).toString());
}
ColumnStatistics cs = ColumnStatisticsImpl.deserialize(null, createDecimalStats("10", "200"));
assertEquals(TruthValue.NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
bf.addString(HiveDecimal.create(19).toString());
assertEquals(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
bf.addString(HiveDecimal.create(15).toString());
assertEquals(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
}
@Test
public void testNullsInBloomFilter() throws Exception {
List<Object> args = new ArrayList<Object>();
args.add(new HiveDecimalWritable("15"));
args.add(null);
args.add(new HiveDecimalWritable("19"));
PredicateLeaf pred = createPredicateLeaf
(PredicateLeaf.Operator.IN, PredicateLeaf.Type.DECIMAL,
"x", null, args);
BloomFilter bf = new BloomFilter(10000);
for (int i = 20; i < 1000; i++) {
bf.addString(HiveDecimal.create(i).toString());
}
ColumnStatistics cs = ColumnStatisticsImpl.deserialize(null, createDecimalStats("10", "200", false));
// hasNull is false, so bloom filter should return NO
assertEquals(TruthValue.NO, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
cs = ColumnStatisticsImpl.deserialize(null, createDecimalStats("10", "200", true));
// hasNull is true, so bloom filter should return YES_NO_NULL
assertEquals(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
bf.addString(HiveDecimal.create(19).toString());
assertEquals(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
bf.addString(HiveDecimal.create(15).toString());
assertEquals(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(cs, pred, bf));
}
@Test
public void testClose() throws Exception {
DataReader mockedDataReader = mock(DataReader.class);
DataReader cloned = mock(DataReader.class);
when(mockedDataReader.clone()).thenReturn(cloned);
closeMockedRecordReader(mockedDataReader);
verify(cloned, atLeastOnce()).close();
}
@Test
public void testCloseWithException() throws Exception {
DataReader mockedDataReader = mock(DataReader.class);
DataReader cloned = mock(DataReader.class);
when(mockedDataReader.clone()).thenReturn(cloned);
doThrow(IOException.class).when(cloned).close();
try {
closeMockedRecordReader(mockedDataReader);
fail("Exception should have been thrown when Record Reader was closed");
} catch (IOException expected) {
}
verify(cloned, atLeastOnce()).close();
}
Path workDir = new Path(System.getProperty("test.tmp.dir",
"target" + File.separator + "test" + File.separator + "tmp"));
private void closeMockedRecordReader(DataReader mockedDataReader) throws IOException {
Configuration conf = new Configuration();
Path path = new Path(workDir, "empty.orc");
FileSystem.get(conf).delete(path, true);
Writer writer = OrcFile.createWriter(path, OrcFile.writerOptions(conf)
.setSchema(TypeDescription.createLong()));
writer.close();
Reader reader = OrcFile.createReader(path, OrcFile.readerOptions(conf));
RecordReader recordReader = reader.rows(reader.options()
.dataReader(mockedDataReader));
recordReader.close();
}
static ByteBuffer createDataStream(int tag, int bytes) {
ByteBuffer result = ByteBuffer.allocate(bytes);
IntBuffer iBuf = result.asIntBuffer();
for(int i= 0; i < bytes; i += 4) {
iBuf.put((tag << 24) + i);
}
result.limit(bytes);
return result;
}
static OrcProto.RowIndexEntry entry(int... values) {
OrcProto.RowIndexEntry.Builder builder = OrcProto.RowIndexEntry.newBuilder();
for(int v: values) {
builder.addPositions(v);
}
return builder.build();
}
static ByteBuffer createRowIndex(InStream.StreamOptions options,
OrcProto.RowIndexEntry... entries
) throws IOException {
ByteBuffer uncompressed = createRowIndex(entries);
if (options.getCodec() != null) {
CompressionCodec codec = options.getCodec();
PhysicalFsWriter.BufferedStream buffer =
new PhysicalFsWriter.BufferedStream();
StreamOptions writerOptions = new StreamOptions(options.getBufferSize())
.withCodec(codec, codec.getDefaultOptions());
try (OutStream out = new OutStream("row index", writerOptions, buffer)) {
out.write(uncompressed.array(),
uncompressed.arrayOffset() + uncompressed.position(),
uncompressed.remaining());
out.flush();
}
return buffer.getByteBuffer();
} else {
return uncompressed;
}
}
static ByteBuffer createRowIndex(OrcProto.RowIndexEntry... entries) {
OrcProto.RowIndex.Builder builder = OrcProto.RowIndex.newBuilder();
for(OrcProto.RowIndexEntry entry: entries) {
builder.addEntry(entry);
}
return ByteBuffer.wrap(builder.build().toByteArray());
}
static ByteBuffer createRowIndex(int value) {
OrcProto.RowIndexEntry entry =
OrcProto.RowIndexEntry.newBuilder().addPositions(value).build();
return ByteBuffer.wrap(OrcProto.RowIndex.newBuilder().addEntry(entry)
.build().toByteArray());
}
static ByteBuffer createBloomFilter(int value) {
OrcProto.BloomFilter entry =
OrcProto.BloomFilter.newBuilder().setNumHashFunctions(value).build();
return ByteBuffer.wrap(OrcProto.BloomFilterIndex.newBuilder()
.addBloomFilter(entry).build().toByteArray());
}
static MockDataReader createOldBlooms(TypeDescription schema) {
return new MockDataReader(schema)
.addStream(1, OrcProto.Stream.Kind.ROW_INDEX, createRowIndex(10))
.addStream(1, OrcProto.Stream.Kind.BLOOM_FILTER, createBloomFilter(11))
.addStream(2, OrcProto.Stream.Kind.ROW_INDEX, createRowIndex(20))
.addStream(2, OrcProto.Stream.Kind.BLOOM_FILTER, createBloomFilter(21))
.addStream(3, OrcProto.Stream.Kind.ROW_INDEX, createRowIndex(30))
.addStream(3, OrcProto.Stream.Kind.BLOOM_FILTER, createBloomFilter(31))
.addStripeFooter(1000, null);
}
static MockDataReader createMixedBlooms(TypeDescription schema) {
return new MockDataReader(schema)
.addStream(1, OrcProto.Stream.Kind.ROW_INDEX, createRowIndex(10))
.addStream(1, OrcProto.Stream.Kind.BLOOM_FILTER, createBloomFilter(11))
.addStream(2, OrcProto.Stream.Kind.ROW_INDEX, createRowIndex(20))
.addStream(2, OrcProto.Stream.Kind.BLOOM_FILTER, createBloomFilter(21))
.addStream(2, OrcProto.Stream.Kind.BLOOM_FILTER_UTF8, createBloomFilter(22))
.addStream(3, OrcProto.Stream.Kind.ROW_INDEX, createRowIndex(30))
.addStream(3, OrcProto.Stream.Kind.BLOOM_FILTER, createBloomFilter(31))
.addStream(3, OrcProto.Stream.Kind.BLOOM_FILTER_UTF8, createBloomFilter(32))
.addStripeFooter(1000, null);
}
static MockDataReader createNewBlooms(TypeDescription schema) {
return new MockDataReader(schema)
.addStream(1, OrcProto.Stream.Kind.ROW_INDEX, createRowIndex(10))
.addStream(1, OrcProto.Stream.Kind.BLOOM_FILTER, createBloomFilter(11))
.addStream(2, OrcProto.Stream.Kind.ROW_INDEX, createRowIndex(20))
.addStream(2, OrcProto.Stream.Kind.BLOOM_FILTER_UTF8, createBloomFilter(22))
.addStream(3, OrcProto.Stream.Kind.ROW_INDEX, createRowIndex(30))
.addStream(3, OrcProto.Stream.Kind.BLOOM_FILTER_UTF8, createBloomFilter(32))
.addStripeFooter(1000, null);
}
@Test
public void testOldBloomFilters() throws Exception {
TypeDescription schema = TypeDescription.fromString("struct<x:int,y:decimal(10,2),z:string>");
MockDataReader dataReader = createOldBlooms(schema);
MockStripe stripe = dataReader.getStripe(0);
// use old blooms
ReaderEncryption encryption = new ReaderEncryption();
StripePlanner planner = new StripePlanner(schema, encryption, dataReader,
OrcFile.WriterVersion.HIVE_4243, false, Integer.MAX_VALUE);
planner.parseStripe(stripe, new boolean[]{true, true, false, true});
OrcIndex index =
planner.readRowIndex(new boolean[]{false, true, false, true}, null);
OrcProto.Stream.Kind[] bloomFilterKinds = index.getBloomFilterKinds();
assertEquals(OrcProto.Stream.Kind.BLOOM_FILTER, bloomFilterKinds[1]);
assertEquals(OrcProto.Stream.Kind.BLOOM_FILTER, bloomFilterKinds[3]);
assertEquals(1, stripe.getStream(1, OrcProto.Stream.Kind.ROW_INDEX).readCount);
assertEquals(1, stripe.getStream(1, OrcProto.Stream.Kind.BLOOM_FILTER).readCount);
assertEquals(0, stripe.getStream(2, OrcProto.Stream.Kind.ROW_INDEX).readCount);
assertEquals(0, stripe.getStream(2, OrcProto.Stream.Kind.BLOOM_FILTER).readCount);
assertEquals(1, stripe.getStream(3, OrcProto.Stream.Kind.ROW_INDEX).readCount);
assertEquals(1, stripe.getStream(3, OrcProto.Stream.Kind.BLOOM_FILTER).readCount);
// ignore non-utf8 bloom filter
dataReader.resetCounts();
Arrays.fill(bloomFilterKinds, null);
planner = new StripePlanner(schema, encryption, dataReader,
OrcFile.WriterVersion.HIVE_4243, true, Integer.MAX_VALUE);
planner.parseStripe(stripe, new boolean[]{true, true, true, false});
planner.readRowIndex(new boolean[]{false, true, true, false}, index);
assertEquals(OrcProto.Stream.Kind.BLOOM_FILTER, bloomFilterKinds[1]);
assertEquals(null, bloomFilterKinds[2]);
assertEquals(1, stripe.getStream(1, OrcProto.Stream.Kind.ROW_INDEX).readCount);
assertEquals(1, stripe.getStream(1, OrcProto.Stream.Kind.BLOOM_FILTER).readCount);
assertEquals(1, stripe.getStream(2, OrcProto.Stream.Kind.ROW_INDEX).readCount);
assertEquals(0, stripe.getStream(2, OrcProto.Stream.Kind.BLOOM_FILTER).readCount);
assertEquals(0, stripe.getStream(3, OrcProto.Stream.Kind.ROW_INDEX).readCount);
assertEquals(0, stripe.getStream(3, OrcProto.Stream.Kind.BLOOM_FILTER).readCount);
// check that we are handling the post hive-12055 strings correctly
dataReader.resetCounts();
Arrays.fill(bloomFilterKinds, null);
planner = new StripePlanner(schema, encryption, dataReader,
OrcFile.WriterVersion.HIVE_12055, true, Integer.MAX_VALUE);
planner.parseStripe(stripe, new boolean[]{true, true, true, true});
planner.readRowIndex(new boolean[]{false, true, true, true}, index);
assertEquals(OrcProto.Stream.Kind.BLOOM_FILTER, bloomFilterKinds[1]);
assertEquals(null, bloomFilterKinds[2]);
assertEquals(OrcProto.Stream.Kind.BLOOM_FILTER, bloomFilterKinds[3]);
assertEquals(1, stripe.getStream(1, OrcProto.Stream.Kind.ROW_INDEX).readCount);
assertEquals(1, stripe.getStream(1, OrcProto.Stream.Kind.BLOOM_FILTER).readCount);
assertEquals(1, stripe.getStream(2, OrcProto.Stream.Kind.ROW_INDEX).readCount);
assertEquals(0, stripe.getStream(2, OrcProto.Stream.Kind.BLOOM_FILTER).readCount);
assertEquals(1, stripe.getStream(3, OrcProto.Stream.Kind.ROW_INDEX).readCount);
assertEquals(1, stripe.getStream(3, OrcProto.Stream.Kind.BLOOM_FILTER).readCount);
// ignore non-utf8 bloom filter on decimal
dataReader.resetCounts();
Arrays.fill(bloomFilterKinds, null);
planner = new StripePlanner(schema, encryption, dataReader,
OrcFile.WriterVersion.HIVE_4243, true, Integer.MAX_VALUE);
planner.parseStripe(stripe, new boolean[]{true, false, true, false});
planner.readRowIndex(new boolean[]{false, false, true, false}, index);
assertEquals(null, bloomFilterKinds[1]);
assertEquals(null, bloomFilterKinds[2]);
assertEquals(0, stripe.getStream(1, OrcProto.Stream.Kind.ROW_INDEX).readCount);
assertEquals(0, stripe.getStream(1, OrcProto.Stream.Kind.BLOOM_FILTER).readCount);
assertEquals(1, stripe.getStream(2, OrcProto.Stream.Kind.ROW_INDEX).readCount);
assertEquals(0, stripe.getStream(2, OrcProto.Stream.Kind.BLOOM_FILTER).readCount);
assertEquals(0, stripe.getStream(3, OrcProto.Stream.Kind.ROW_INDEX).readCount);
assertEquals(0, stripe.getStream(3, OrcProto.Stream.Kind.BLOOM_FILTER).readCount);
}
@Test
public void testCompatibleBloomFilters() throws Exception {
TypeDescription schema = TypeDescription.fromString("struct<x:int,y:decimal(10,2),z:string>");
MockDataReader dataReader = createMixedBlooms(schema);
MockStripe stripe = dataReader.getStripe(0);
// use old bloom filters
ReaderEncryption encryption = new ReaderEncryption();
StripePlanner planner = new StripePlanner(schema, encryption, dataReader,
OrcFile.WriterVersion.HIVE_4243, true, Integer.MAX_VALUE);
planner.parseStripe(stripe, new boolean[]{true, true, false, true});
OrcIndex index =
planner.readRowIndex(new boolean[]{false, true, false, true}, null);
OrcProto.Stream.Kind[] bloomFilterKinds = index.getBloomFilterKinds();
assertEquals(OrcProto.Stream.Kind.BLOOM_FILTER, bloomFilterKinds[1]);
assertEquals(OrcProto.Stream.Kind.BLOOM_FILTER_UTF8, bloomFilterKinds[3]);
assertEquals(1, stripe.getStream(1, OrcProto.Stream.Kind.ROW_INDEX).readCount);
assertEquals(1, stripe.getStream(1, OrcProto.Stream.Kind.BLOOM_FILTER).readCount);
assertEquals(0, stripe.getStream(2, OrcProto.Stream.Kind.ROW_INDEX).readCount);
assertEquals(0, stripe.getStream(2, OrcProto.Stream.Kind.BLOOM_FILTER).readCount);
assertEquals(0, stripe.getStream(2, OrcProto.Stream.Kind.BLOOM_FILTER_UTF8).readCount);
assertEquals(1, stripe.getStream(3, OrcProto.Stream.Kind.ROW_INDEX).readCount);
assertEquals(0, stripe.getStream(3, OrcProto.Stream.Kind.BLOOM_FILTER).readCount);
assertEquals(1, stripe.getStream(3, OrcProto.Stream.Kind.BLOOM_FILTER_UTF8).readCount);
// ignore non-utf8 bloom filter
Arrays.fill(bloomFilterKinds, null);
dataReader.resetCounts();
planner = new StripePlanner(schema, encryption, dataReader,
OrcFile.WriterVersion.HIVE_4243, true, Integer.MAX_VALUE);
planner.parseStripe(stripe, new boolean[]{true, true, true, false});
planner.readRowIndex(new boolean[]{false, true, true, false}, index);
assertEquals(OrcProto.Stream.Kind.BLOOM_FILTER, bloomFilterKinds[1]);
assertEquals(OrcProto.Stream.Kind.BLOOM_FILTER_UTF8, bloomFilterKinds[2]);
assertEquals(1, stripe.getStream(1, OrcProto.Stream.Kind.ROW_INDEX).readCount);
assertEquals(1, stripe.getStream(1, OrcProto.Stream.Kind.BLOOM_FILTER).readCount);
assertEquals(1, stripe.getStream(2, OrcProto.Stream.Kind.ROW_INDEX).readCount);
assertEquals(0, stripe.getStream(2, OrcProto.Stream.Kind.BLOOM_FILTER).readCount);
assertEquals(1, stripe.getStream(2, OrcProto.Stream.Kind.BLOOM_FILTER_UTF8).readCount);
assertEquals(0, stripe.getStream(3, OrcProto.Stream.Kind.ROW_INDEX).readCount);
assertEquals(0, stripe.getStream(3, OrcProto.Stream.Kind.BLOOM_FILTER).readCount);
assertEquals(0, stripe.getStream(3, OrcProto.Stream.Kind.BLOOM_FILTER_UTF8).readCount);
}
@Test
public void testNewBloomFilters() throws Exception {
TypeDescription schema = TypeDescription.fromString("struct<x:int,y:decimal(10,2),z:string>");
MockDataReader dataReader = createNewBlooms(schema);
MockStripe stripe = dataReader.getStripe(0);
// use old bloom filters
ReaderEncryption encryption = new ReaderEncryption();
StripePlanner planner = new StripePlanner(schema, encryption, dataReader,
OrcFile.WriterVersion.HIVE_4243, true, Integer.MAX_VALUE);
planner.parseStripe(stripe, new boolean[]{true, true, false, true});
OrcIndex index =
planner.readRowIndex(new boolean[]{false, true, false, true}, null);
OrcProto.Stream.Kind[] bloomFilterKinds = index.getBloomFilterKinds();
assertEquals(OrcProto.Stream.Kind.BLOOM_FILTER, bloomFilterKinds[1]);
assertEquals(OrcProto.Stream.Kind.BLOOM_FILTER_UTF8, bloomFilterKinds[3]);
assertEquals(1, stripe.getStream(1, OrcProto.Stream.Kind.ROW_INDEX).readCount);
assertEquals(1, stripe.getStream(1, OrcProto.Stream.Kind.BLOOM_FILTER).readCount);
assertEquals(0, stripe.getStream(2, OrcProto.Stream.Kind.ROW_INDEX).readCount);
assertEquals(0, stripe.getStream(2, OrcProto.Stream.Kind.BLOOM_FILTER_UTF8).readCount);
assertEquals(1, stripe.getStream(3, OrcProto.Stream.Kind.ROW_INDEX).readCount);
assertEquals(1, stripe.getStream(3, OrcProto.Stream.Kind.BLOOM_FILTER_UTF8).readCount);
// ignore non-utf8 bloom filter
Arrays.fill(bloomFilterKinds, null);
dataReader.resetCounts();
planner = new StripePlanner(schema, encryption, dataReader,
OrcFile.WriterVersion.HIVE_4243, true, Integer.MAX_VALUE);
planner.parseStripe(stripe, new boolean[]{true, true, true, false});
planner.readRowIndex(new boolean[]{false, true, true, false}, index);
assertEquals(OrcProto.Stream.Kind.BLOOM_FILTER, bloomFilterKinds[1]);
assertEquals(OrcProto.Stream.Kind.BLOOM_FILTER_UTF8, bloomFilterKinds[2]);
assertEquals(1, stripe.getStream(1, OrcProto.Stream.Kind.ROW_INDEX).readCount);
assertEquals(1, stripe.getStream(1, OrcProto.Stream.Kind.BLOOM_FILTER).readCount);
assertEquals(1, stripe.getStream(2, OrcProto.Stream.Kind.ROW_INDEX).readCount);
assertEquals(1, stripe.getStream(2, OrcProto.Stream.Kind.BLOOM_FILTER_UTF8).readCount);
assertEquals(0, stripe.getStream(3, OrcProto.Stream.Kind.ROW_INDEX).readCount);
assertEquals(0, stripe.getStream(3, OrcProto.Stream.Kind.BLOOM_FILTER_UTF8).readCount);
}
static OrcProto.RowIndexEntry createIndexEntry(Long min, Long max) {
return OrcProto.RowIndexEntry.newBuilder()
.setStatistics(createIntStats(min, max)).build();
}
@Test
public void testPickRowGroups() throws Exception {
Configuration conf = new Configuration();
TypeDescription schema = TypeDescription.fromString("struct<x:int,y:int>");
SchemaEvolution evolution = new SchemaEvolution(schema, schema,
new Reader.Options(conf));
SearchArgument sarg =
SearchArgumentFactory.newBuilder()
.startAnd()
.equals("x", PredicateLeaf.Type.LONG, 100L)
.equals("y", PredicateLeaf.Type.LONG, 10L)
.end().build();
RecordReaderImpl.SargApplier applier =
new RecordReaderImpl.SargApplier(sarg, 1000, evolution,
OrcFile.WriterVersion.ORC_135, false, false, false);
OrcProto.StripeInformation stripe =
OrcProto.StripeInformation.newBuilder().setNumberOfRows(4000).build();
OrcProto.RowIndex[] indexes = new OrcProto.RowIndex[3];
indexes[1] = OrcProto.RowIndex.newBuilder()
.addEntry(createIndexEntry(0L, 10L))
.addEntry(createIndexEntry(100L, 200L))
.addEntry(createIndexEntry(300L, 500L))
.addEntry(createIndexEntry(100L, 100L))
.build();
indexes[2] = OrcProto.RowIndex.newBuilder()
.addEntry(createIndexEntry(0L, 9L))
.addEntry(createIndexEntry(11L, 20L))
.addEntry(createIndexEntry(10L, 10L))
.addEntry(createIndexEntry(0L, 100L))
.build();
List<OrcProto.ColumnEncoding> encodings = new ArrayList<>();
encodings.add(OrcProto.ColumnEncoding.newBuilder()
.setKind(OrcProto.ColumnEncoding.Kind.DIRECT).build());
encodings.add(OrcProto.ColumnEncoding.newBuilder()
.setKind(OrcProto.ColumnEncoding.Kind.DIRECT_V2).build());
encodings.add(OrcProto.ColumnEncoding.newBuilder()
.setKind(OrcProto.ColumnEncoding.Kind.DIRECT_V2).build());
boolean[] rows = applier.pickRowGroups(
new ReaderImpl.StripeInformationImpl(stripe, 1, -1, null),
indexes, null, encodings, null, false);
assertEquals(4, rows.length);
assertEquals(false, rows[0]);
assertEquals(false, rows[1]);
assertEquals(false, rows[2]);
assertEquals(true, rows[3]);
assertEquals(0, applier.getExceptionCount()[0]);
assertEquals(0, applier.getExceptionCount()[1]);
}
@Test
public void testPickRowGroupsError() throws Exception {
Configuration conf = new Configuration();
TypeDescription schema = TypeDescription.fromString("struct<x:int,y:int>");
SchemaEvolution evolution = new SchemaEvolution(schema, schema,
new Reader.Options(conf));
SearchArgument sarg =
SearchArgumentFactory.newBuilder()
.startAnd()
.equals("x", PredicateLeaf.Type.DATE, Date.valueOf("2017-01-02"))
.equals("y", PredicateLeaf.Type.LONG, 10L)
.end().build();
RecordReaderImpl.SargApplier applier =
new RecordReaderImpl.SargApplier(sarg, 1000, evolution,
OrcFile.WriterVersion.ORC_135, false, false, false);
OrcProto.StripeInformation stripe =
OrcProto.StripeInformation.newBuilder().setNumberOfRows(3000).build();
OrcProto.RowIndex[] indexes = new OrcProto.RowIndex[3];
indexes[1] = OrcProto.RowIndex.newBuilder()
.addEntry(createIndexEntry(0L, 10L))
.addEntry(createIndexEntry(10L, 20L))
.addEntry(createIndexEntry(20L, 30L))
.build();
indexes[2] = OrcProto.RowIndex.newBuilder()
.addEntry(createIndexEntry(0L, 9L))
.addEntry(createIndexEntry(10L, 20L))
.addEntry(createIndexEntry(0L, 30L))
.build();
List<OrcProto.ColumnEncoding> encodings = new ArrayList<>();
encodings.add(OrcProto.ColumnEncoding.newBuilder()
.setKind(OrcProto.ColumnEncoding.Kind.DIRECT).build());
encodings.add(OrcProto.ColumnEncoding.newBuilder()
.setKind(OrcProto.ColumnEncoding.Kind.DIRECT_V2).build());
encodings.add(OrcProto.ColumnEncoding.newBuilder()
.setKind(OrcProto.ColumnEncoding.Kind.DIRECT_V2).build());
boolean[] rows = applier.pickRowGroups(
new ReaderImpl.StripeInformationImpl(stripe, 1, -1, null),
indexes, null, encodings, null, false);
assertEquals(3, rows.length);
assertEquals(false, rows[0]);
assertEquals(true, rows[1]);
assertEquals(true, rows[2]);
assertEquals(1, applier.getExceptionCount()[0]);
assertEquals(0, applier.getExceptionCount()[1]);
}
@Test
public void testPositionalEvolutionAddColumnPPD() throws IOException {
Reader.Options opts = new Reader.Options();
opts.forcePositionalEvolution(true);
TypeDescription file = TypeDescription.fromString("struct<x:int>");
// new column added on reader side
TypeDescription read = TypeDescription.fromString("struct<x:int,y:boolean>");
opts.include(includeAll(read));
SchemaEvolution evo = new SchemaEvolution(file, read, opts);
SearchArgument sarg = SearchArgumentFactory.newBuilder().startAnd()
.equals("y", PredicateLeaf.Type.BOOLEAN, true).end().build();
RecordReaderImpl.SargApplier applier =
new RecordReaderImpl.SargApplier(sarg, 1000, evo,
OrcFile.WriterVersion.ORC_135, false, false, false);
OrcProto.StripeInformation stripe =
OrcProto.StripeInformation.newBuilder().setNumberOfRows(2000).build();
OrcProto.RowIndex[] indexes = new OrcProto.RowIndex[3];
indexes[1] = OrcProto.RowIndex.newBuilder() // index for original x column
.addEntry(createIndexEntry(0L, 10L))
.addEntry(createIndexEntry(100L, 200L))
.build();
indexes[2] = null; // no-op, just for clarifying that new reader column doesn't have an index
List<OrcProto.ColumnEncoding> encodings = new ArrayList<>();
encodings.add(OrcProto.ColumnEncoding.newBuilder().setKind(OrcProto.ColumnEncoding.Kind.DIRECT).build());
boolean[] rows = applier.pickRowGroups(new ReaderImpl.StripeInformationImpl(stripe, 1, -1, null),
indexes, null, encodings, null, false);
assertEquals(SargApplier.READ_ALL_RGS, rows); //cannot filter for new column, return all rows
}
private boolean[] includeAll(TypeDescription readerType) {
int numColumns = readerType.getMaximumId() + 1;
boolean[] result = new boolean[numColumns];
Arrays.fill(result, true);
return result;
}
@Test
public void testSkipDataReaderOpen() throws Exception {
IOException ioe = new IOException("Don't open when there is no stripe");
DataReader mockedDataReader = mock(DataReader.class);
doThrow(ioe).when(mockedDataReader).open();
when(mockedDataReader.clone()).thenReturn(mockedDataReader);
doNothing().when(mockedDataReader).close();
Configuration conf = new Configuration();
Path path = new Path(workDir, "empty.orc");
FileSystem.get(conf).delete(path, true);
OrcFile.WriterOptions options = OrcFile.writerOptions(conf).setSchema(TypeDescription.createLong());
Writer writer = OrcFile.createWriter(path, options);
writer.close();
Reader reader = OrcFile.createReader(path, OrcFile.readerOptions(conf));
Reader.Options readerOptions = reader.options().dataReader(mockedDataReader);
RecordReader recordReader = reader.rows(readerOptions);
recordReader.close();
}
@Test
public void testCloseAtConstructorException() throws Exception {
Configuration conf = new Configuration();
Path path = new Path(workDir, "oneRow.orc");
FileSystem.get(conf).delete(path, true);
TypeDescription schema = TypeDescription.createLong();
OrcFile.WriterOptions options = OrcFile.writerOptions(conf).setSchema(schema);
Writer writer = OrcFile.createWriter(path, options);
VectorizedRowBatch writeBatch = schema.createRowBatch();
int row = writeBatch.size++;
((LongColumnVector) writeBatch.cols[0]).vector[row] = 0;
writer.addRowBatch(writeBatch);
writer.close();
DataReader mockedDataReader = mock(DataReader.class);
when(mockedDataReader.clone()).thenReturn(mockedDataReader);
doThrow(new IOException()).when(mockedDataReader).readStripeFooter(any());
Reader reader = OrcFile.createReader(path, OrcFile.readerOptions(conf));
Reader.Options readerOptions = reader.options().dataReader(mockedDataReader);
boolean isCalled = false;
try {
reader.rows(readerOptions);
} catch (IOException ie) {
isCalled = true;
}
assertTrue(isCalled);
verify(mockedDataReader, times(1)).close();
}
}