| /* |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| */ |
| |
| package org.apache.iceberg.parquet; |
| |
| import java.io.File; |
| import java.io.IOException; |
| import java.util.UUID; |
| import java.util.stream.Collectors; |
| import java.util.stream.IntStream; |
| import org.apache.avro.generic.GenericData.Record; |
| import org.apache.avro.generic.GenericRecordBuilder; |
| import org.apache.iceberg.Files; |
| import org.apache.iceberg.Schema; |
| import org.apache.iceberg.TestHelpers; |
| import org.apache.iceberg.avro.AvroSchemaUtil; |
| import org.apache.iceberg.exceptions.ValidationException; |
| import org.apache.iceberg.expressions.Expression; |
| import org.apache.iceberg.io.FileAppender; |
| import org.apache.iceberg.io.InputFile; |
| import org.apache.iceberg.io.OutputFile; |
| import org.apache.iceberg.types.Types; |
| import org.apache.iceberg.types.Types.DoubleType; |
| import org.apache.iceberg.types.Types.FloatType; |
| import org.apache.iceberg.types.Types.IntegerType; |
| import org.apache.iceberg.types.Types.LongType; |
| import org.apache.iceberg.types.Types.StringType; |
| import org.apache.parquet.column.page.DictionaryPageReadStore; |
| import org.apache.parquet.hadoop.ParquetFileReader; |
| import org.apache.parquet.hadoop.metadata.BlockMetaData; |
| import org.apache.parquet.schema.MessageType; |
| import org.junit.After; |
| import org.junit.Assert; |
| import org.junit.Before; |
| import org.junit.Rule; |
| import org.junit.Test; |
| import org.junit.rules.TemporaryFolder; |
| |
| import static org.apache.iceberg.avro.AvroSchemaUtil.convert; |
| import static org.apache.iceberg.expressions.Expressions.and; |
| import static org.apache.iceberg.expressions.Expressions.equal; |
| import static org.apache.iceberg.expressions.Expressions.greaterThan; |
| import static org.apache.iceberg.expressions.Expressions.greaterThanOrEqual; |
| import static org.apache.iceberg.expressions.Expressions.in; |
| import static org.apache.iceberg.expressions.Expressions.isNaN; |
| import static org.apache.iceberg.expressions.Expressions.isNull; |
| import static org.apache.iceberg.expressions.Expressions.lessThan; |
| import static org.apache.iceberg.expressions.Expressions.lessThanOrEqual; |
| import static org.apache.iceberg.expressions.Expressions.not; |
| import static org.apache.iceberg.expressions.Expressions.notEqual; |
| import static org.apache.iceberg.expressions.Expressions.notIn; |
| import static org.apache.iceberg.expressions.Expressions.notNaN; |
| import static org.apache.iceberg.expressions.Expressions.notNull; |
| import static org.apache.iceberg.expressions.Expressions.or; |
| import static org.apache.iceberg.expressions.Expressions.startsWith; |
| import static org.apache.iceberg.types.Types.NestedField.optional; |
| import static org.apache.iceberg.types.Types.NestedField.required; |
| |
| public class TestDictionaryRowGroupFilter { |
| |
| // Struct type nested under the "struct_not_null" column of the schema the filter is built with. |
| private static final Types.StructType structFieldType = Types.StructType.of( |
|     required(9, "int_field", IntegerType.get())); |
| |
| // Schema passed to the filter. Field ids line up with FILE_SCHEMA, the names carry no leading |
| // underscore, and "not_in_file" (id 10) has no counterpart in FILE_SCHEMA. |
| private static final Schema SCHEMA = new Schema( |
|     Types.NestedField.required(1, "id", IntegerType.get()), |
|     Types.NestedField.optional(2, "no_stats", StringType.get()), |
|     Types.NestedField.required(3, "required", StringType.get()), |
|     Types.NestedField.optional(4, "all_nulls", LongType.get()), |
|     Types.NestedField.optional(5, "some_nulls", StringType.get()), |
|     Types.NestedField.optional(6, "no_nulls", StringType.get()), |
|     Types.NestedField.optional(7, "non_dict", StringType.get()), |
|     Types.NestedField.optional(8, "struct_not_null", structFieldType), |
|     Types.NestedField.optional(10, "not_in_file", FloatType.get()), |
|     Types.NestedField.optional(11, "all_nans", DoubleType.get()), |
|     Types.NestedField.optional(12, "some_nans", FloatType.get()), |
|     Types.NestedField.optional(13, "no_nans", DoubleType.get())); |
| |
| // Struct type nested under the "_struct_not_null" column of the schema used to write the file. |
| private static final Types.StructType _structFieldType = Types.StructType.of( |
|     required(9, "_int_field", IntegerType.get())); |
| |
| // Schema used to write the Parquet file. Field ids line up with SCHEMA, every name carries a |
| // leading underscore, and there is no field with id 10. |
| private static final Schema FILE_SCHEMA = new Schema( |
|     Types.NestedField.required(1, "_id", IntegerType.get()), |
|     Types.NestedField.optional(2, "_no_stats", StringType.get()), |
|     Types.NestedField.required(3, "_required", StringType.get()), |
|     Types.NestedField.optional(4, "_all_nulls", LongType.get()), |
|     Types.NestedField.optional(5, "_some_nulls", StringType.get()), |
|     Types.NestedField.optional(6, "_no_nulls", StringType.get()), |
|     Types.NestedField.optional(7, "_non_dict", StringType.get()), |
|     Types.NestedField.optional(8, "_struct_not_null", _structFieldType), |
|     Types.NestedField.optional(11, "_all_nans", DoubleType.get()), |
|     Types.NestedField.optional(12, "_some_nans", FloatType.get()), |
|     Types.NestedField.optional(13, "_no_nans", DoubleType.get())); |
| |
| // Concatenation of 200 random UUID strings; createInputFile() writes it to the "_no_stats" column. |
| private static final String TOO_LONG_FOR_STATS; |
| |
| static { |
|   StringBuilder joined = new StringBuilder(); |
|   int remaining = 200; |
|   while (remaining > 0) { |
|     joined.append(UUID.randomUUID()); |
|     remaining -= 1; |
|   } |
|   TOO_LONG_FOR_STATS = joined.toString(); |
| } |
| |
| // Smallest and largest values written to the "_id" and "_int_field" columns by createInputFile(). |
| private static final int INT_MIN_VALUE = 30; |
| private static final int INT_MAX_VALUE = 79; |
| |
| // Assigned by createInputFile() before every test. |
| private MessageType parquetSchema; |
| private BlockMetaData rowGroupMetadata; |
| private DictionaryPageReadStore dictionaryStore; |
| |
| // Provides the temporary file that createInputFile() writes to. |
| @Rule |
| public TemporaryFolder temp = new TemporaryFolder(); |
| |
| // Reader that dictionaryStore was obtained from; opened by createInputFile(), released by closeInputFile(). |
| private ParquetFileReader fileReader = null; |
| |
| /** |
|  * Writes a Parquet file with FILE_SCHEMA and captures the Parquet schema, row-group metadata and |
|  * dictionary store that the tests pass to the filter. |
|  * |
|  * @throws IOException if the temporary file cannot be created, written or opened |
|  */ |
| @Before |
| public void createInputFile() throws IOException { |
|   File parquetFile = temp.newFile(); |
|   // remove the placeholder created by the rule (presumably the appender must create the file itself) |
|   Assert.assertTrue(parquetFile.delete()); |
| |
|   // build struct field schema |
|   org.apache.avro.Schema structSchema = AvroSchemaUtil.convert(_structFieldType); |
| |
|   OutputFile outFile = Files.localOutput(parquetFile); |
|   try (FileAppender<Record> appender = Parquet.write(outFile) |
|       .schema(FILE_SCHEMA) |
|       .build()) { |
|     GenericRecordBuilder builder = new GenericRecordBuilder(convert(FILE_SCHEMA, "table")); |
|     // create 20 copies of each record to ensure dictionary-encoding |
|     for (int copy = 0; copy < 20; copy += 1) { |
|       // create 50 records |
|       for (int i = 0; i < INT_MAX_VALUE - INT_MIN_VALUE + 1; i += 1) { |
|         builder.set("_id", INT_MIN_VALUE + i); // min=30, max=79, num-nulls=0 |
|         builder.set("_no_stats", TOO_LONG_FOR_STATS); // value longer than 4k will produce no stats |
|         builder.set("_required", "req"); // required, always non-null |
|         builder.set("_all_nulls", null); // never non-null |
|         builder.set("_some_nulls", (i % 10 == 0) ? null : "some"); // includes some null values |
|         builder.set("_no_nulls", ""); // optional, but always non-null |
|         builder.set("_non_dict", UUID.randomUUID().toString()); // not dictionary-encoded |
|         builder.set("_all_nans", Double.NaN); // never non-nan |
|         builder.set("_some_nans", (i % 10 == 0) ? Float.NaN : 2F); // includes some nan values |
|         builder.set("_no_nans", 3D); // optional, but always non-nan |
| |
|         Record structNotNull = new Record(structSchema); |
|         structNotNull.put("_int_field", INT_MIN_VALUE + i); |
|         builder.set("_struct_not_null", structNotNull); // struct with int |
| |
|         appender.add(builder.build()); |
|       } |
|     } |
|   } |
| |
|   InputFile inFile = Files.localInput(parquetFile); |
| |
|   // store the reader in a field right away so closeInputFile() releases it even if a check below fails; |
|   // it stays open during the test because dictionaryStore is obtained from it |
|   fileReader = ParquetFileReader.open(ParquetIO.file(inFile)); |
| |
|   Assert.assertEquals("Should create only one row group", 1, fileReader.getRowGroups().size()); |
|   rowGroupMetadata = fileReader.getRowGroups().get(0); |
|   parquetSchema = fileReader.getFileMetaData().getSchema(); |
|   dictionaryStore = fileReader.getNextDictionaryReader(); |
| } |
| |
| /** |
|  * Closes the Parquet reader opened by createInputFile() so each test releases its file handle. |
|  * |
|  * @throws IOException if closing the reader fails |
|  */ |
| @After |
| public void closeInputFile() throws IOException { |
|   if (fileReader != null) { |
|     fileReader.close(); |
|     fileReader = null; |
|   } |
| } |
| |
| /** |
|  * Checks that every comparison factory rejects a null literal, so the remaining cases do not need |
|  * to test expressions with null literals. |
|  */ |
| @Test |
| public void testAssumptions() { |
|   final Class<NullPointerException> expectedType = NullPointerException.class; |
|   final String expectedMessage = "Cannot create expression literal from null"; |
| |
|   TestHelpers.assertThrows( |
|       "Should reject null literal in equal expression", expectedType, expectedMessage, |
|       () -> equal("col", null)); |
|   TestHelpers.assertThrows( |
|       "Should reject null literal in notEqual expression", expectedType, expectedMessage, |
|       () -> notEqual("col", null)); |
|   TestHelpers.assertThrows( |
|       "Should reject null literal in lessThan expression", expectedType, expectedMessage, |
|       () -> lessThan("col", null)); |
|   TestHelpers.assertThrows( |
|       "Should reject null literal in lessThanOrEqual expression", expectedType, expectedMessage, |
|       () -> lessThanOrEqual("col", null)); |
|   TestHelpers.assertThrows( |
|       "Should reject null literal in greaterThan expression", expectedType, expectedMessage, |
|       () -> greaterThan("col", null)); |
|   TestHelpers.assertThrows( |
|       "Should reject null literal in greaterThanOrEqual expression", expectedType, expectedMessage, |
|       () -> greaterThanOrEqual("col", null)); |
|   TestHelpers.assertThrows( |
|       "Should reject null literal in startsWith expression", expectedType, expectedMessage, |
|       () -> startsWith("col", null)); |
| } |
| |
| @Test |
| public void testAllNulls() { |
| boolean shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, notNull("all_nulls")) |
| .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); |
| Assert.assertTrue("Should read: dictionary filter doesn't help", shouldRead); |
| |
| shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, notNull("some_nulls")) |
| .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); |
| Assert.assertTrue("Should read: dictionary filter doesn't help", shouldRead); |
| |
| shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, notNull("no_nulls")) |
| .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); |
| Assert.assertTrue("Should read: dictionary filter doesn't help", shouldRead); |
| |
| shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, notNull("struct_not_null")) |
| .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); |
| Assert.assertTrue("Should read: dictionary filter doesn't help", shouldRead); |
| } |
| |
| @Test |
| public void testNoNulls() { |
| boolean shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, isNull("all_nulls")) |
| .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); |
| Assert.assertTrue("Should read: dictionary filter doesn't help", shouldRead); |
| |
| shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, isNull("some_nulls")) |
| .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); |
| Assert.assertTrue("Should read: dictionary filter doesn't help", shouldRead); |
| |
| shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, isNull("no_nulls")) |
| .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); |
| Assert.assertTrue("Should read: dictionary filter doesn't help", shouldRead); |
| |
| shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, isNull("struct_not_null")) |
| .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); |
| Assert.assertTrue("Should read: dictionary filter doesn't help", shouldRead); |
| } |
| |
| @Test |
| public void testRequiredColumn() { |
| boolean shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, notNull("required")) |
| .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); |
| Assert.assertTrue("Should read: required columns are always non-null", shouldRead); |
| |
| shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, isNull("required")) |
| .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); |
| Assert.assertFalse("Should skip: required columns are always non-null", shouldRead); |
| } |
| |
| @Test |
| public void testIsNaNs() { |
| boolean shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, isNaN("all_nans")) |
| .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); |
| Assert.assertTrue("Should read: all_nans column will contain NaN", shouldRead); |
| |
| shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, isNaN("some_nans")) |
| .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); |
| Assert.assertTrue("Should read: some_nans column will contain NaN", shouldRead); |
| |
| shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, isNaN("no_nans")) |
| .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); |
| Assert.assertFalse("Should skip: no_nans column will not contain NaN", shouldRead); |
| } |
| |
| @Test |
| public void testNotNaNs() { |
| boolean shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, notNaN("all_nans")) |
| .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); |
| Assert.assertFalse("Should skip: all_nans column will not contain non-NaN", shouldRead); |
| |
| shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, notNaN("some_nans")) |
| .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); |
| Assert.assertTrue("Should read: some_nans column will contain non-NaN", shouldRead); |
| |
| shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, notNaN("no_nans")) |
| .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); |
| Assert.assertTrue("Should read: no_nans column will contain non-NaN", shouldRead); |
| } |
| |
| /** |
|  * Exercises startsWith() against a column without a dictionary, prefixes that match a written |
|  * value, and prefixes that match nothing. |
|  */ |
| @Test |
| public void testStartsWith() { |
|   boolean shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, startsWith("non_dict", "re")) |
|       .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); |
|   Assert.assertTrue("Should read: no dictionary", shouldRead); |
| |
|   Expression[] matchingPrefixes = new Expression[] { |
|       startsWith("required", "re"), |
|       startsWith("required", "req"), |
|       startsWith("some_nulls", "so") |
|   }; |
|   for (Expression matching : matchingPrefixes) { |
|     shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, matching) |
|         .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); |
|     Assert.assertTrue("Should read: dictionary contains a matching entry", shouldRead); |
|   } |
| |
|   // a fresh random UUID is used as a prefix that the written "_no_stats" value is not expected to have |
|   shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, startsWith("no_stats", UUID.randomUUID().toString())) |
|       .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); |
|   Assert.assertFalse("Should skip: no stats but dictionary is present", shouldRead); |
| |
|   Expression[] missingPrefixes = new Expression[] { |
|       startsWith("required", "reqs"), |
|       startsWith("some_nulls", "somex"), |
|       startsWith("no_nulls", "xxx") |
|   }; |
|   for (Expression missing : missingPrefixes) { |
|     shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, missing) |
|         .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); |
|     Assert.assertFalse("Should skip: no match in dictionary", shouldRead); |
|   } |
| } |
| |
| @Test |
| public void testMissingColumn() { |
| TestHelpers.assertThrows("Should complain about missing column in expression", |
| ValidationException.class, "Cannot find field 'missing'", |
| () -> new ParquetDictionaryRowGroupFilter(SCHEMA, lessThan("missing", 5)) |
| .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore)); |
| } |
| |
| @Test |
| public void testColumnNotInFile() { |
| Expression[] exprs = new Expression[] { |
| lessThan("not_in_file", 1.0f), lessThanOrEqual("not_in_file", 1.0f), |
| equal("not_in_file", 1.0f), greaterThan("not_in_file", 1.0f), |
| greaterThanOrEqual("not_in_file", 1.0f), notNull("not_in_file"), |
| isNull("not_in_file"), notEqual("not_in_file", 1.0f) |
| }; |
| |
| for (Expression expr : exprs) { |
| boolean shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, expr) |
| .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); |
| Assert.assertTrue("Should read: dictionary cannot be found: " + expr, shouldRead); |
| } |
| } |
| |
| @Test |
| public void testColumnFallbackOrNotDictionaryEncoded() { |
| Expression[] exprs = new Expression[] { |
| lessThan("non_dict", "a"), lessThanOrEqual("non_dict", "a"), equal("non_dict", "a"), |
| greaterThan("non_dict", "a"), greaterThanOrEqual("non_dict", "a"), notNull("non_dict"), |
| isNull("non_dict"), notEqual("non_dict", "a") |
| }; |
| |
| for (Expression expr : exprs) { |
| boolean shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, expr) |
| .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); |
| Assert.assertTrue("Should read: dictionary cannot be found: " + expr, shouldRead); |
| } |
| } |
| |
| @Test |
| public void testMissingStats() { |
| boolean shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, equal("no_stats", "a")) |
| .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); |
| Assert.assertFalse("Should skip: stats are missing but dictionary is present", shouldRead); |
| } |
| |
| @Test |
| public void testNot() { |
| // this test case must use a real predicate, not alwaysTrue(), or binding will simplify it out |
| boolean shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, not(lessThan("id", INT_MIN_VALUE - 25))) |
| .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); |
| Assert.assertTrue("Should read: not(false)", shouldRead); |
| |
| shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, not(greaterThan("id", INT_MIN_VALUE - 25))) |
| .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); |
| Assert.assertFalse("Should skip: not(true)", shouldRead); |
| } |
| |
| @Test |
| public void testAnd() { |
| // this test case must use a real predicate, not alwaysTrue(), or binding will simplify it out |
| boolean shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, |
| and(lessThan("id", INT_MIN_VALUE - 25), greaterThanOrEqual("id", INT_MIN_VALUE - 30))) |
| .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); |
| Assert.assertFalse("Should skip: and(false, true)", shouldRead); |
| |
| shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, |
| and(lessThan("id", INT_MIN_VALUE - 25), greaterThanOrEqual("id", INT_MAX_VALUE + 1))) |
| .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); |
| Assert.assertFalse("Should skip: and(false, false)", shouldRead); |
| |
| shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, |
| and(greaterThan("id", INT_MIN_VALUE - 25), lessThanOrEqual("id", INT_MIN_VALUE))) |
| .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); |
| Assert.assertTrue("Should read: and(true, true)", shouldRead); |
| } |
| |
| @Test |
| public void testOr() { |
| // this test case must use a real predicate, not alwaysTrue(), or binding will simplify it out |
| boolean shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, |
| or(lessThan("id", INT_MIN_VALUE - 25), greaterThanOrEqual("id", INT_MAX_VALUE + 1))) |
| .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); |
| Assert.assertFalse("Should skip: or(false, false)", shouldRead); |
| |
| shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, |
| or(lessThan("id", INT_MIN_VALUE - 25), greaterThanOrEqual("id", INT_MAX_VALUE - 19))) |
| .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); |
| Assert.assertTrue("Should read: or(false, true)", shouldRead); |
| } |
| |
| /** |
|  * Checks lessThan() on "id" (written values 30..79) with literals below, at and above the minimum. |
|  */ |
| @Test |
| public void testIntegerLt() { |
|   boolean shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, lessThan("id", INT_MIN_VALUE - 25)) |
|       .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); |
|   Assert.assertFalse("Should not read: id range below lower bound (5 < 30)", shouldRead); |
| |
|   shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, lessThan("id", INT_MIN_VALUE)) |
|       .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); |
|   Assert.assertFalse("Should not read: id range below lower bound (30 is not < 30)", shouldRead); |
| |
|   shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, lessThan("id", INT_MIN_VALUE + 1)) |
|       .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); |
|   Assert.assertTrue("Should read: one possible id", shouldRead); |
| |
|   shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, lessThan("id", INT_MAX_VALUE)) |
|       .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); |
|   // failure message previously read "may possible ids" |
|   Assert.assertTrue("Should read: many possible ids", shouldRead); |
| } |
| |
| @Test |
| public void testIntegerLtEq() { |
| boolean shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, lessThanOrEqual("id", INT_MIN_VALUE - 25)) |
| .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); |
| Assert.assertFalse("Should not read: id range below lower bound (5 < 30)", shouldRead); |
| |
| shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, lessThanOrEqual("id", INT_MIN_VALUE - 1)) |
| .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); |
| Assert.assertFalse("Should not read: id range below lower bound (29 < 30)", shouldRead); |
| |
| shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, lessThanOrEqual("id", INT_MIN_VALUE)) |
| .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); |
| Assert.assertTrue("Should read: one possible id", shouldRead); |
| |
| shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, lessThanOrEqual("id", INT_MAX_VALUE)) |
| .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); |
| Assert.assertTrue("Should read: many possible ids", shouldRead); |
| } |
| |
| /** |
|  * Checks greaterThan() on "id" (written values 30..79) with literals above, at and below the maximum. |
|  */ |
| @Test |
| public void testIntegerGt() { |
|   boolean shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, greaterThan("id", INT_MAX_VALUE + 6)) |
|       .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); |
|   // failure message previously stated the false comparison "85 < 79" |
|   Assert.assertFalse("Should not read: id range above upper bound (85 > 79)", shouldRead); |
| |
|   shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, greaterThan("id", INT_MAX_VALUE)) |
|       .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); |
|   Assert.assertFalse("Should not read: id range above upper bound (79 is not > 79)", shouldRead); |
| |
|   shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, greaterThan("id", INT_MAX_VALUE - 1)) |
|       .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); |
|   Assert.assertTrue("Should read: one possible id", shouldRead); |
| |
|   shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, greaterThan("id", INT_MAX_VALUE - 4)) |
|       .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); |
|   // failure message previously read "may possible ids" |
|   Assert.assertTrue("Should read: many possible ids", shouldRead); |
| } |
| |
| /** |
|  * Checks greaterThanOrEqual() on "id" (written values 30..79) with literals above, at and below |
|  * the maximum. |
|  */ |
| @Test |
| public void testIntegerGtEq() { |
|   boolean shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, greaterThanOrEqual("id", INT_MAX_VALUE + 6)) |
|       .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); |
|   // failure message previously stated the false comparison "85 < 79" |
|   Assert.assertFalse("Should not read: id range above upper bound (85 > 79)", shouldRead); |
| |
|   shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, greaterThanOrEqual("id", INT_MAX_VALUE + 1)) |
|       .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); |
|   Assert.assertFalse("Should not read: id range above upper bound (80 > 79)", shouldRead); |
| |
|   shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, greaterThanOrEqual("id", INT_MAX_VALUE)) |
|       .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); |
|   Assert.assertTrue("Should read: one possible id", shouldRead); |
| |
|   shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, greaterThanOrEqual("id", INT_MAX_VALUE - 4)) |
|       .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); |
|   // failure message previously read "may possible ids" |
|   Assert.assertTrue("Should read: many possible ids", shouldRead); |
| } |
| |
| @Test |
| public void testIntegerEq() { |
| boolean shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, equal("id", INT_MIN_VALUE - 25)) |
| .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); |
| Assert.assertFalse("Should not read: id below lower bound", shouldRead); |
| |
| shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, equal("id", INT_MIN_VALUE - 1)) |
| .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); |
| Assert.assertFalse("Should not read: id below lower bound", shouldRead); |
| |
| shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, equal("id", INT_MIN_VALUE)) |
| .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); |
| Assert.assertTrue("Should read: id equal to lower bound", shouldRead); |
| |
| shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, equal("id", INT_MAX_VALUE - 4)) |
| .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); |
| Assert.assertTrue("Should read: id between lower and upper bounds", shouldRead); |
| |
| shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, equal("id", INT_MAX_VALUE)) |
| .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); |
| Assert.assertTrue("Should read: id equal to upper bound", shouldRead); |
| |
| shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, equal("id", INT_MAX_VALUE + 1)) |
| .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); |
| Assert.assertFalse("Should not read: id above upper bound", shouldRead); |
| |
| shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, equal("id", INT_MAX_VALUE + 6)) |
| .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); |
| Assert.assertFalse("Should not read: id above upper bound", shouldRead); |
| } |
| |
| @Test |
| public void testIntegerNotEq() { |
| boolean shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, notEqual("id", INT_MIN_VALUE - 25)) |
| .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); |
| Assert.assertTrue("Should read: id below lower bound", shouldRead); |
| |
| shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, notEqual("id", INT_MIN_VALUE - 1)) |
| .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); |
| Assert.assertTrue("Should read: id below lower bound", shouldRead); |
| |
| shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, notEqual("id", INT_MIN_VALUE)) |
| .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); |
| Assert.assertTrue("Should read: id equal to lower bound", shouldRead); |
| |
| shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, notEqual("id", INT_MAX_VALUE - 4)) |
| .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); |
| Assert.assertTrue("Should read: id between lower and upper bounds", shouldRead); |
| |
| shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, notEqual("id", INT_MAX_VALUE)) |
| .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); |
| Assert.assertTrue("Should read: id equal to upper bound", shouldRead); |
| |
| shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, notEqual("id", INT_MAX_VALUE + 1)) |
| .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); |
| Assert.assertTrue("Should read: id above upper bound", shouldRead); |
| |
| shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, notEqual("id", INT_MAX_VALUE + 6)) |
| .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); |
| Assert.assertTrue("Should read: id above upper bound", shouldRead); |
| } |
| |
| @Test |
| public void testIntegerNotEqRewritten() { |
| boolean shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, not(equal("id", INT_MIN_VALUE - 25))) |
| .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); |
| Assert.assertTrue("Should read: id below lower bound", shouldRead); |
| |
| shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, not(equal("id", INT_MIN_VALUE - 1))) |
| .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); |
| Assert.assertTrue("Should read: id below lower bound", shouldRead); |
| |
| shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, not(equal("id", INT_MIN_VALUE))) |
| .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); |
| Assert.assertTrue("Should read: id equal to lower bound", shouldRead); |
| |
| shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, not(equal("id", INT_MAX_VALUE - 4))) |
| .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); |
| Assert.assertTrue("Should read: id between lower and upper bounds", shouldRead); |
| |
| shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, not(equal("id", INT_MAX_VALUE))) |
| .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); |
| Assert.assertTrue("Should read: id equal to upper bound", shouldRead); |
| |
| shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, not(equal("id", INT_MAX_VALUE + 1))) |
| .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); |
| Assert.assertTrue("Should read: id above upper bound", shouldRead); |
| |
| shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, not(equal("id", INT_MAX_VALUE + 6))) |
| .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); |
| Assert.assertTrue("Should read: id above upper bound", shouldRead); |
| } |
| |
| @Test |
| public void testStringNotEq() { |
| boolean shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, notEqual("some_nulls", "some")) |
| .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); |
| Assert.assertTrue("Should read: contains null != 'some'", shouldRead); |
| |
| shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, notEqual("no_nulls", "")) |
| .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); |
| Assert.assertFalse("Should skip: contains only ''", shouldRead); |
| } |
| |
| /** |
|  * Checks lessThan() on the nested "struct_not_null.int_field" (written values 30..79) with |
|  * literals below, at and above the minimum. |
|  */ |
| @Test |
| public void testStructFieldLt() { |
|   boolean shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, |
|       lessThan("struct_not_null.int_field", INT_MIN_VALUE - 25) |
|   ).shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); |
|   Assert.assertFalse("Should not read: id range below lower bound (5 < 30)", shouldRead); |
| |
|   shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, lessThan("struct_not_null.int_field", INT_MIN_VALUE)) |
|       .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); |
|   Assert.assertFalse("Should not read: id range below lower bound (30 is not < 30)", shouldRead); |
| |
|   shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, lessThan("struct_not_null.int_field", INT_MIN_VALUE + 1)) |
|       .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); |
|   Assert.assertTrue("Should read: one possible id", shouldRead); |
| |
|   shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, lessThan("struct_not_null.int_field", INT_MAX_VALUE)) |
|       .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); |
|   // failure message previously read "may possible ids" |
|   Assert.assertTrue("Should read: many possible ids", shouldRead); |
| } |
| |
| @Test |
| public void testStructFieldLtEq() { |
| boolean shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, |
| lessThanOrEqual("struct_not_null.int_field", INT_MIN_VALUE - 25) |
| ).shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); |
| Assert.assertFalse("Should not read: id range below lower bound (5 < 30)", shouldRead); |
| |
| shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, |
| lessThanOrEqual("struct_not_null.int_field", INT_MIN_VALUE - 1) |
| ).shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); |
| Assert.assertFalse("Should not read: id range below lower bound (29 < 30)", shouldRead); |
| |
| shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, |
| lessThanOrEqual("struct_not_null.int_field", INT_MIN_VALUE) |
| ).shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); |
| Assert.assertTrue("Should read: one possible id", shouldRead); |
| |
| shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, |
| lessThanOrEqual("struct_not_null.int_field", INT_MAX_VALUE) |
| ).shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); |
| Assert.assertTrue("Should read: many possible ids", shouldRead); |
| |
| } |
| |
| @Test |
| public void testStructFieldGt() { |
| boolean shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, |
| greaterThan("struct_not_null.int_field", INT_MAX_VALUE + 6) |
| ).shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); |
| Assert.assertFalse("Should not read: id range above upper bound (85 < 79)", shouldRead); |
| |
| shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, |
| greaterThan("struct_not_null.int_field", INT_MAX_VALUE) |
| ).shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); |
| Assert.assertFalse("Should not read: id range above upper bound (79 is not > 79)", shouldRead); |
| |
| shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, |
| greaterThan("struct_not_null.int_field", INT_MAX_VALUE - 1) |
| ).shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); |
| Assert.assertTrue("Should read: one possible id", shouldRead); |
| |
| shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, |
| greaterThan("struct_not_null.int_field", INT_MAX_VALUE - 4) |
| ).shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); |
| Assert.assertTrue("Should read: may possible ids", shouldRead); |
| } |
| |
| @Test |
| public void testStructFieldGtEq() { |
| boolean shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, |
| greaterThanOrEqual("struct_not_null.int_field", INT_MAX_VALUE + 6) |
| ).shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); |
| Assert.assertFalse("Should not read: id range above upper bound (85 < 79)", shouldRead); |
| |
| shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, |
| greaterThanOrEqual("struct_not_null.int_field", INT_MAX_VALUE + 1) |
| ).shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); |
| Assert.assertFalse("Should not read: id range above upper bound (80 > 79)", shouldRead); |
| |
| shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, |
| greaterThanOrEqual("struct_not_null.int_field", INT_MAX_VALUE) |
| ).shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); |
| Assert.assertTrue("Should read: one possible id", shouldRead); |
| |
| shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, |
| greaterThanOrEqual("struct_not_null.int_field", INT_MAX_VALUE - 4) |
| ).shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); |
| Assert.assertTrue("Should read: may possible ids", shouldRead); |
| } |
| |
| @Test |
| public void testStructFieldEq() { |
| boolean shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, |
| equal("struct_not_null.int_field", INT_MIN_VALUE - 25) |
| ).shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); |
| Assert.assertFalse("Should not read: id below lower bound", shouldRead); |
| |
| shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, equal("struct_not_null.int_field", INT_MIN_VALUE - 1)) |
| .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); |
| Assert.assertFalse("Should not read: id below lower bound", shouldRead); |
| |
| shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, equal("struct_not_null.int_field", INT_MIN_VALUE)) |
| .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); |
| Assert.assertTrue("Should read: id equal to lower bound", shouldRead); |
| |
| shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, equal("struct_not_null.int_field", INT_MAX_VALUE - 4)) |
| .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); |
| Assert.assertTrue("Should read: id between lower and upper bounds", shouldRead); |
| |
| shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, equal("struct_not_null.int_field", INT_MAX_VALUE)) |
| .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); |
| Assert.assertTrue("Should read: id equal to upper bound", shouldRead); |
| |
| shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, equal("struct_not_null.int_field", INT_MAX_VALUE + 1)) |
| .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); |
| Assert.assertFalse("Should not read: id above upper bound", shouldRead); |
| |
| shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, equal("struct_not_null.int_field", INT_MAX_VALUE + 6)) |
| .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); |
| Assert.assertFalse("Should not read: id above upper bound", shouldRead); |
| } |
| |
| @Test |
| public void testStructFieldNotEq() { |
| boolean shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, |
| notEqual("struct_not_null.int_field", INT_MIN_VALUE - 25) |
| ).shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); |
| Assert.assertTrue("Should read: id below lower bound", shouldRead); |
| |
| shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, notEqual("struct_not_null.int_field", INT_MIN_VALUE - 1)) |
| .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); |
| Assert.assertTrue("Should read: id below lower bound", shouldRead); |
| |
| shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, notEqual("struct_not_null.int_field", INT_MIN_VALUE)) |
| .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); |
| Assert.assertTrue("Should read: id equal to lower bound", shouldRead); |
| |
| shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, notEqual("struct_not_null.int_field", INT_MAX_VALUE - 4)) |
| .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); |
| Assert.assertTrue("Should read: id between lower and upper bounds", shouldRead); |
| |
| shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, notEqual("struct_not_null.int_field", INT_MAX_VALUE)) |
| .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); |
| Assert.assertTrue("Should read: id equal to upper bound", shouldRead); |
| |
| shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, notEqual("id", INT_MAX_VALUE + 1)) |
| .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); |
| Assert.assertTrue("Should read: id above upper bound", shouldRead); |
| |
| shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, notEqual("struct_not_null.int_field", INT_MAX_VALUE + 6)) |
| .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); |
| Assert.assertTrue("Should read: id above upper bound", shouldRead); |
| } |
| |
| @Test |
| public void testCaseInsensitive() { |
| boolean shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, notEqual("no_Nulls", ""), false) |
| .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); |
| Assert.assertFalse("Should skip: contains only ''", shouldRead); |
| } |
| |
| @Test |
| public void testMissingDictionaryPageForColumn() { |
| TestHelpers.assertThrows("Should complain about missing dictionary", |
| IllegalStateException.class, "Failed to read required dictionary page for id: 5", |
| () -> new ParquetDictionaryRowGroupFilter(SCHEMA, notEqual("some_nulls", "some")) |
| .shouldRead(parquetSchema, rowGroupMetadata, descriptor -> null)); |
| } |
| |
/**
 * Verifies {@code in} filtering against the {@code id}, {@code all_nulls}, {@code some_nulls} and
 * {@code no_nulls} columns.
 *
 * <p>An {@code in} set that shares no value with the column's dictionary must skip the row group; a set that
 * overlaps it must read the row group. The {@code all_nulls} column is expected to be read.
 */
@Test
public void testIntegerIn() {
  // Integer sets entirely below the stored range.
  Assert.assertFalse("Should not read: id below lower bound (5 < 30, 6 < 30). The two sets are disjoint.",
      new ParquetDictionaryRowGroupFilter(SCHEMA, in("id", INT_MIN_VALUE - 25, INT_MIN_VALUE - 24))
          .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore));

  Assert.assertFalse("Should not read: id below lower bound (28 < 30, 29 < 30). The two sets are disjoint.",
      new ParquetDictionaryRowGroupFilter(SCHEMA, in("id", INT_MIN_VALUE - 2, INT_MIN_VALUE - 1))
          .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore));

  // Integer sets touching or inside the stored range.
  Assert.assertTrue("Should read: id equal to lower bound (30 == 30)",
      new ParquetDictionaryRowGroupFilter(SCHEMA, in("id", INT_MIN_VALUE - 1, INT_MIN_VALUE))
          .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore));

  Assert.assertTrue("Should read: in set is a subset of the dictionary",
      new ParquetDictionaryRowGroupFilter(SCHEMA, in("id", INT_MAX_VALUE - 4, INT_MAX_VALUE - 3))
          .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore));

  Assert.assertTrue("Should read: id equal to upper bound (79 == 79)",
      new ParquetDictionaryRowGroupFilter(SCHEMA, in("id", INT_MAX_VALUE, INT_MAX_VALUE + 1))
          .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore));

  // Integer sets entirely above the stored range.
  Assert.assertFalse("Should not read: id above upper bound (80 > 79, 81 > 79)",
      new ParquetDictionaryRowGroupFilter(SCHEMA, in("id", INT_MAX_VALUE + 1, INT_MAX_VALUE + 2))
          .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore));

  Assert.assertFalse("Should not read: id above upper bound (85 > 79, 86 > 79). The two sets are disjoint.",
      new ParquetDictionaryRowGroupFilter(SCHEMA, in("id", INT_MAX_VALUE + 6, INT_MAX_VALUE + 7))
          .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore));

  // Integer sets that contain the whole stored range.
  Assert.assertTrue("Should read: the dictionary is a subset of the in set",
      new ParquetDictionaryRowGroupFilter(SCHEMA,
          in("id", IntStream.range(INT_MIN_VALUE - 10, INT_MAX_VALUE + 10).boxed().collect(Collectors.toList())))
          .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore));

  Assert.assertTrue("Should read: the dictionary is equal to the in set",
      new ParquetDictionaryRowGroupFilter(SCHEMA,
          in("id", IntStream.range(INT_MIN_VALUE, INT_MAX_VALUE + 1).boxed().collect(Collectors.toList())))
          .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore));

  // Columns that contain nulls.
  Assert.assertTrue("Should read: in on all nulls column (isFallback to be true) ",
      new ParquetDictionaryRowGroupFilter(SCHEMA, in("all_nulls", 1, 2))
          .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore));

  Assert.assertTrue("Should read: in on some nulls column",
      new ParquetDictionaryRowGroupFilter(SCHEMA, in("some_nulls", "aaa", "some"))
          .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore));

  Assert.assertFalse("Should not read: some_nulls values are not within the set",
      new ParquetDictionaryRowGroupFilter(SCHEMA, in("some_nulls", "aaa", "bbb"))
          .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore));

  // Column without nulls.
  Assert.assertFalse("Should not read: in on no nulls column (empty string is not within the set)",
      new ParquetDictionaryRowGroupFilter(SCHEMA, in("no_nulls", "aaa", "bbb"))
          .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore));

  Assert.assertTrue("Should read: in on no nulls column (empty string is within the set)",
      new ParquetDictionaryRowGroupFilter(SCHEMA, in("no_nulls", "aaa", ""))
          .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore));
}
| |
| @Test |
| public void testIntegerNotIn() { |
| boolean shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, |
| notIn("id", INT_MIN_VALUE - 25, INT_MIN_VALUE - 24) |
| ).shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); |
| Assert.assertTrue("Should read: id below lower bound (5 < 30, 6 < 30). The two sets are disjoint.", shouldRead); |
| |
| shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, notIn("id", INT_MIN_VALUE - 2, INT_MIN_VALUE - 1)) |
| .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); |
| Assert.assertTrue("Should read: id below lower bound (28 < 30, 29 < 30). The two sets are disjoint.", shouldRead); |
| |
| shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, notIn("id", INT_MIN_VALUE - 1, INT_MIN_VALUE)) |
| .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); |
| Assert.assertTrue("Should read: id equal to lower bound (30 == 30)", shouldRead); |
| |
| shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, notIn("id", INT_MAX_VALUE - 4, INT_MAX_VALUE - 3)) |
| .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); |
| Assert.assertTrue("Should read: the notIn set is a subset of the dictionary", shouldRead); |
| |
| shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, notIn("id", INT_MAX_VALUE, INT_MAX_VALUE + 1)) |
| .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); |
| Assert.assertTrue("Should read: id equal to upper bound (79 == 79)", shouldRead); |
| |
| shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, notIn("id", INT_MAX_VALUE + 1, INT_MAX_VALUE + 2)) |
| .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); |
| Assert.assertTrue("Should read: id above upper bound (80 > 79, 81 > 79). The two sets are disjoint.", shouldRead); |
| |
| shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, notIn("id", INT_MAX_VALUE + 6, INT_MAX_VALUE + 7)) |
| .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); |
| Assert.assertTrue("Should read: id above upper bound (85 > 79, 86 > 79). The two sets are disjoint.", shouldRead); |
| |
| shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, |
| notIn("id", IntStream.range(INT_MIN_VALUE - 10, INT_MAX_VALUE + 10).boxed().collect(Collectors.toList())) |
| ).shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); |
| Assert.assertFalse("Should not read: the dictionary is a subset of the notIn set", shouldRead); |
| |
| shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, |
| notIn("id", IntStream.range(INT_MIN_VALUE, INT_MAX_VALUE + 1).boxed().collect(Collectors.toList())) |
| ).shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); |
| Assert.assertFalse("Should not read: the dictionary is equal to the notIn set", shouldRead); |
| |
| shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, notIn("all_nulls", 1, 2)) |
| .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); |
| Assert.assertTrue("Should read: notIn on all nulls column", shouldRead); |
| |
| shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, notIn("some_nulls", "aaa", "bbb")) |
| .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); |
| Assert.assertTrue("Should read: notIn on some nulls column (any null matches the notIn)", shouldRead); |
| |
| shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, notIn("no_nulls", "aaa", "bbb")) |
| .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); |
| Assert.assertTrue("Should read: notIn on no nulls column (empty string is not within the set)", shouldRead); |
| |
| shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, notIn("no_nulls", "aaa", "")) |
| .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore); |
| Assert.assertFalse("Should not read: notIn on no nulls column (empty string is within the set)", shouldRead); |
| } |
| } |