blob: d8b440791424c507408c519a812bea2aaa4d119f [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.parquet.filter2.statisticslevel;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import org.apache.parquet.io.api.Binary;
import org.junit.Test;
import org.apache.parquet.column.Encoding;
import org.apache.parquet.column.statistics.DoubleStatistics;
import org.apache.parquet.column.statistics.IntStatistics;
import org.apache.parquet.hadoop.metadata.ColumnPath;
import org.apache.parquet.filter2.predicate.FilterPredicate;
import org.apache.parquet.filter2.predicate.LogicalInverseRewriter;
import org.apache.parquet.filter2.predicate.Operators.BinaryColumn;
import org.apache.parquet.filter2.predicate.Operators.DoubleColumn;
import org.apache.parquet.filter2.predicate.Operators.IntColumn;
import org.apache.parquet.filter2.predicate.Statistics;
import org.apache.parquet.filter2.predicate.UserDefinedPredicate;
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName;
import static org.apache.parquet.filter2.predicate.FilterApi.binaryColumn;
import static org.apache.parquet.io.api.Binary.fromString;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;
import static org.apache.parquet.filter2.predicate.FilterApi.and;
import static org.apache.parquet.filter2.predicate.FilterApi.doubleColumn;
import static org.apache.parquet.filter2.predicate.FilterApi.eq;
import static org.apache.parquet.filter2.predicate.FilterApi.gt;
import static org.apache.parquet.filter2.predicate.FilterApi.gtEq;
import static org.apache.parquet.filter2.predicate.FilterApi.intColumn;
import static org.apache.parquet.filter2.predicate.FilterApi.lt;
import static org.apache.parquet.filter2.predicate.FilterApi.ltEq;
import static org.apache.parquet.filter2.predicate.FilterApi.not;
import static org.apache.parquet.filter2.predicate.FilterApi.notEq;
import static org.apache.parquet.filter2.predicate.FilterApi.or;
import static org.apache.parquet.filter2.predicate.FilterApi.userDefined;
import static org.apache.parquet.filter2.statisticslevel.StatisticsFilter.canDrop;
public class TestStatisticsFilter {
private static ColumnChunkMetaData getIntColumnMeta(IntStatistics stats, long valueCount) {
return ColumnChunkMetaData.get(ColumnPath.get("int", "column"),
PrimitiveTypeName.INT32,
CompressionCodecName.GZIP,
new HashSet<Encoding>(Arrays.asList(Encoding.PLAIN)),
stats,
0L, 0L, valueCount, 0L, 0L);
}
private static ColumnChunkMetaData getDoubleColumnMeta(DoubleStatistics stats, long valueCount) {
return ColumnChunkMetaData.get(ColumnPath.get("double", "column"),
PrimitiveTypeName.DOUBLE,
CompressionCodecName.GZIP,
new HashSet<Encoding>(Arrays.asList(Encoding.PLAIN)),
stats,
0L, 0L, valueCount, 0L, 0L);
}
private static final IntColumn intColumn = intColumn("int.column");
private static final DoubleColumn doubleColumn = doubleColumn("double.column");
private static final BinaryColumn missingColumn = binaryColumn("missing");
private static final IntColumn missingColumn2 = intColumn("missing.int");
private static final IntStatistics intStats = new IntStatistics();
private static final IntStatistics nullIntStats = new IntStatistics();
private static final DoubleStatistics doubleStats = new DoubleStatistics();
static {
intStats.setMinMax(10, 100);
doubleStats.setMinMax(10, 100);
nullIntStats.setMinMax(0, 0);
nullIntStats.setNumNulls(177);
}
private static final List<ColumnChunkMetaData> columnMetas = Arrays.asList(
getIntColumnMeta(intStats, 177L),
getDoubleColumnMeta(doubleStats, 177L));
private static final List<ColumnChunkMetaData> nullColumnMetas = Arrays.asList(
getIntColumnMeta(nullIntStats, 177L), // column of all nulls
getDoubleColumnMeta(doubleStats, 177L));
@Test
public void testEqNonNull() {
assertTrue(canDrop(eq(intColumn, 9), columnMetas));
assertFalse(canDrop(eq(intColumn, 10), columnMetas));
assertFalse(canDrop(eq(intColumn, 100), columnMetas));
assertTrue(canDrop(eq(intColumn, 101), columnMetas));
// drop columns of all nulls when looking for non-null value
assertTrue(canDrop(eq(intColumn, 0), nullColumnMetas));
assertTrue(canDrop(eq(missingColumn, fromString("any")), columnMetas));
}
@Test
public void testEqNull() {
IntStatistics statsNoNulls = new IntStatistics();
statsNoNulls.setMinMax(10, 100);
statsNoNulls.setNumNulls(0);
IntStatistics statsSomeNulls = new IntStatistics();
statsSomeNulls.setMinMax(10, 100);
statsSomeNulls.setNumNulls(3);
assertTrue(canDrop(eq(intColumn, null), Arrays.asList(
getIntColumnMeta(statsNoNulls, 177L),
getDoubleColumnMeta(doubleStats, 177L))));
assertFalse(canDrop(eq(intColumn, null), Arrays.asList(
getIntColumnMeta(statsSomeNulls, 177L),
getDoubleColumnMeta(doubleStats, 177L))));
assertFalse(canDrop(eq(missingColumn, null), columnMetas));
}
@Test
public void testNotEqNonNull() {
assertFalse(canDrop(notEq(intColumn, 9), columnMetas));
assertFalse(canDrop(notEq(intColumn, 10), columnMetas));
assertFalse(canDrop(notEq(intColumn, 100), columnMetas));
assertFalse(canDrop(notEq(intColumn, 101), columnMetas));
IntStatistics allSevens = new IntStatistics();
allSevens.setMinMax(7, 7);
assertTrue(canDrop(notEq(intColumn, 7), Arrays.asList(
getIntColumnMeta(allSevens, 177L),
getDoubleColumnMeta(doubleStats, 177L))));
allSevens.setNumNulls(100L);
assertFalse(canDrop(notEq(intColumn, 7), Arrays.asList(
getIntColumnMeta(allSevens, 177L),
getDoubleColumnMeta(doubleStats, 177L))));
allSevens.setNumNulls(177L);
assertFalse(canDrop(notEq(intColumn, 7), Arrays.asList(
getIntColumnMeta(allSevens, 177L),
getDoubleColumnMeta(doubleStats, 177L))));
assertFalse(canDrop(notEq(missingColumn, fromString("any")), columnMetas));
}
@Test
public void testNotEqNull() {
IntStatistics statsNoNulls = new IntStatistics();
statsNoNulls.setMinMax(10, 100);
statsNoNulls.setNumNulls(0);
IntStatistics statsSomeNulls = new IntStatistics();
statsSomeNulls.setMinMax(10, 100);
statsSomeNulls.setNumNulls(3);
IntStatistics statsAllNulls = new IntStatistics();
statsAllNulls.setMinMax(0, 0);
statsAllNulls.setNumNulls(177);
assertFalse(canDrop(notEq(intColumn, null), Arrays.asList(
getIntColumnMeta(statsNoNulls, 177L),
getDoubleColumnMeta(doubleStats, 177L))));
assertFalse(canDrop(notEq(intColumn, null), Arrays.asList(
getIntColumnMeta(statsSomeNulls, 177L),
getDoubleColumnMeta(doubleStats, 177L))));
assertTrue(canDrop(notEq(intColumn, null), Arrays.asList(
getIntColumnMeta(statsAllNulls, 177L),
getDoubleColumnMeta(doubleStats, 177L))));
assertTrue(canDrop(notEq(missingColumn, null), columnMetas));
}
@Test
public void testLt() {
assertTrue(canDrop(lt(intColumn, 9), columnMetas));
assertTrue(canDrop(lt(intColumn, 10), columnMetas));
assertFalse(canDrop(lt(intColumn, 100), columnMetas));
assertFalse(canDrop(lt(intColumn, 101), columnMetas));
assertTrue(canDrop(lt(intColumn, 0), nullColumnMetas));
assertTrue(canDrop(lt(intColumn, 7), nullColumnMetas));
assertTrue(canDrop(lt(missingColumn, fromString("any")), columnMetas));
}
@Test
public void testLtEq() {
assertTrue(canDrop(ltEq(intColumn, 9), columnMetas));
assertFalse(canDrop(ltEq(intColumn, 10), columnMetas));
assertFalse(canDrop(ltEq(intColumn, 100), columnMetas));
assertFalse(canDrop(ltEq(intColumn, 101), columnMetas));
assertTrue(canDrop(ltEq(intColumn, 0), nullColumnMetas));
assertTrue(canDrop(ltEq(intColumn, 7), nullColumnMetas));
assertTrue(canDrop(ltEq(missingColumn, fromString("any")), columnMetas));
}
@Test
public void testGt() {
assertFalse(canDrop(gt(intColumn, 9), columnMetas));
assertFalse(canDrop(gt(intColumn, 10), columnMetas));
assertTrue(canDrop(gt(intColumn, 100), columnMetas));
assertTrue(canDrop(gt(intColumn, 101), columnMetas));
assertTrue(canDrop(gt(intColumn, 0), nullColumnMetas));
assertTrue(canDrop(gt(intColumn, 7), nullColumnMetas));
assertTrue(canDrop(gt(missingColumn, fromString("any")), columnMetas));
}
@Test
public void testGtEq() {
assertFalse(canDrop(gtEq(intColumn, 9), columnMetas));
assertFalse(canDrop(gtEq(intColumn, 10), columnMetas));
assertFalse(canDrop(gtEq(intColumn, 100), columnMetas));
assertTrue(canDrop(gtEq(intColumn, 101), columnMetas));
assertTrue(canDrop(gtEq(intColumn, 0), nullColumnMetas));
assertTrue(canDrop(gtEq(intColumn, 7), nullColumnMetas));
assertTrue(canDrop(gtEq(missingColumn, fromString("any")), columnMetas));
}
@Test
public void testAnd() {
FilterPredicate yes = eq(intColumn, 9);
FilterPredicate no = eq(doubleColumn, 50D);
assertTrue(canDrop(and(yes, yes), columnMetas));
assertTrue(canDrop(and(yes, no), columnMetas));
assertTrue(canDrop(and(no, yes), columnMetas));
assertFalse(canDrop(and(no, no), columnMetas));
}
@Test
public void testOr() {
FilterPredicate yes = eq(intColumn, 9);
FilterPredicate no = eq(doubleColumn, 50D);
assertTrue(canDrop(or(yes, yes), columnMetas));
assertFalse(canDrop(or(yes, no), columnMetas));
assertFalse(canDrop(or(no, yes), columnMetas));
assertFalse(canDrop(or(no, no), columnMetas));
}
public static class SevensAndEightsUdp extends UserDefinedPredicate<Integer> {
@Override
public boolean keep(Integer value) {
if (value == null) {
return true;
}
throw new RuntimeException("this method should not be called with value != null");
}
@Override
public boolean canDrop(Statistics<Integer> statistics) {
return statistics.getMin() == 7 && statistics.getMax() == 7;
}
@Override
public boolean inverseCanDrop(Statistics<Integer> statistics) {
return statistics.getMin() == 8 && statistics.getMax() == 8;
}
}
public static class DropNullUdp extends SevensAndEightsUdp {
@Override
public boolean keep(Integer value) {
if (value == null) {
return false;
}
throw new RuntimeException("this method should not be called with value != null");
}
}
@Test
public void testUdp() {
FilterPredicate pred = userDefined(intColumn, SevensAndEightsUdp.class);
FilterPredicate invPred = LogicalInverseRewriter.rewrite(not(userDefined(intColumn, SevensAndEightsUdp.class)));
FilterPredicate udpDropMissingColumn = userDefined(missingColumn2, DropNullUdp.class);
FilterPredicate invUdpDropMissingColumn = LogicalInverseRewriter.rewrite(not(userDefined(missingColumn2, DropNullUdp.class)));
FilterPredicate udpKeepMissingColumn = userDefined(missingColumn2, SevensAndEightsUdp.class);
FilterPredicate invUdpKeepMissingColumn = LogicalInverseRewriter.rewrite(not(userDefined(missingColumn2, SevensAndEightsUdp.class)));
IntStatistics seven = new IntStatistics();
seven.setMinMax(7, 7);
IntStatistics eight = new IntStatistics();
eight.setMinMax(8, 8);
IntStatistics neither = new IntStatistics();
neither.setMinMax(1 , 2);
assertTrue(canDrop(pred, Arrays.asList(
getIntColumnMeta(seven, 177L),
getDoubleColumnMeta(doubleStats, 177L))));
assertFalse(canDrop(pred, Arrays.asList(
getIntColumnMeta(eight, 177L),
getDoubleColumnMeta(doubleStats, 177L))));
assertFalse(canDrop(pred, Arrays.asList(
getIntColumnMeta(neither, 177L),
getDoubleColumnMeta(doubleStats, 177L))));
assertFalse(canDrop(invPred, Arrays.asList(
getIntColumnMeta(seven, 177L),
getDoubleColumnMeta(doubleStats, 177L))));
assertTrue(canDrop(invPred, Arrays.asList(
getIntColumnMeta(eight, 177L),
getDoubleColumnMeta(doubleStats, 177L))));
assertFalse(canDrop(invPred, Arrays.asList(
getIntColumnMeta(neither, 177L),
getDoubleColumnMeta(doubleStats, 177L))));
// udpDropMissingColumn drops null column.
assertTrue(canDrop(udpDropMissingColumn, Arrays.asList(
getIntColumnMeta(seven, 177L),
getDoubleColumnMeta(doubleStats, 177L))));
assertTrue(canDrop(udpDropMissingColumn, Arrays.asList(
getIntColumnMeta(eight, 177L),
getDoubleColumnMeta(doubleStats, 177L))));
assertTrue(canDrop(udpDropMissingColumn, Arrays.asList(
getIntColumnMeta(neither, 177L),
getDoubleColumnMeta(doubleStats, 177L))));
// invUdpDropMissingColumn (i.e., not(udpDropMissingColumn)) keeps null column.
assertFalse(canDrop(invUdpDropMissingColumn, Arrays.asList(
getIntColumnMeta(seven, 177L),
getDoubleColumnMeta(doubleStats, 177L))));
assertFalse(canDrop(invUdpDropMissingColumn, Arrays.asList(
getIntColumnMeta(eight, 177L),
getDoubleColumnMeta(doubleStats, 177L))));
assertFalse(canDrop(invUdpDropMissingColumn, Arrays.asList(
getIntColumnMeta(neither, 177L),
getDoubleColumnMeta(doubleStats, 177L))));
// udpKeepMissingColumn keeps null column.
assertFalse(canDrop(udpKeepMissingColumn, Arrays.asList(
getIntColumnMeta(seven, 177L),
getDoubleColumnMeta(doubleStats, 177L))));
assertFalse(canDrop(udpKeepMissingColumn, Arrays.asList(
getIntColumnMeta(eight, 177L),
getDoubleColumnMeta(doubleStats, 177L))));
assertFalse(canDrop(udpKeepMissingColumn, Arrays.asList(
getIntColumnMeta(neither, 177L),
getDoubleColumnMeta(doubleStats, 177L))));
// invUdpKeepMissingColumn (i.e., not(udpKeepMissingColumn)) drops null column.
assertTrue(canDrop(invUdpKeepMissingColumn, Arrays.asList(
getIntColumnMeta(seven, 177L),
getDoubleColumnMeta(doubleStats, 177L))));
assertTrue(canDrop(invUdpKeepMissingColumn, Arrays.asList(
getIntColumnMeta(eight, 177L),
getDoubleColumnMeta(doubleStats, 177L))));
assertTrue(canDrop(invUdpKeepMissingColumn, Arrays.asList(
getIntColumnMeta(neither, 177L),
getDoubleColumnMeta(doubleStats, 177L))));
}
@Test
public void testClearExceptionForNots() {
List<ColumnChunkMetaData> columnMetas = Arrays.asList(
getDoubleColumnMeta(new DoubleStatistics(), 0L),
getIntColumnMeta(new IntStatistics(), 0L));
FilterPredicate pred = and(not(eq(doubleColumn, 12.0)), eq(intColumn, 17));
try {
canDrop(pred, columnMetas);
fail("This should throw");
} catch (IllegalArgumentException e) {
assertEquals("This predicate contains a not! Did you forget to run this predicate through LogicalInverseRewriter?"
+ " not(eq(double.column, 12.0))", e.getMessage());
}
}
}