PARQUET-791: Add missing column support for UserDefinedPredicate
This extends the fix from #354 to UserDefinedPredicate.
Author: Liang-Chi Hsieh <viirya@gmail.com>
Closes #389 from viirya/PARQUET-791 and squashes the following commits:
d6be37d [Liang-Chi Hsieh] Address comment.
7e929c3 [Liang-Chi Hsieh] PARQUET-791: Add missing column support for UserDefinedPredicate.
diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/filter2/statisticslevel/StatisticsFilter.java b/parquet-hadoop/src/main/java/org/apache/parquet/filter2/statisticslevel/StatisticsFilter.java
index b37297a..ac7132e 100644
--- a/parquet-hadoop/src/main/java/org/apache/parquet/filter2/statisticslevel/StatisticsFilter.java
+++ b/parquet-hadoop/src/main/java/org/apache/parquet/filter2/statisticslevel/StatisticsFilter.java
@@ -328,18 +328,31 @@
Column<T> filterColumn = ud.getColumn();
ColumnChunkMetaData columnChunk = getColumnChunk(filterColumn.getColumnPath());
U udp = ud.getUserDefinedPredicate();
+
+ if (columnChunk == null) {
+ // the column isn't in this file so all values are null.
+ // let's run the udp with a null value to see whether it keeps null or not.
+ if (inverted) {
+ return udp.keep(null);
+ } else {
+ return !udp.keep(null);
+ }
+ }
+
Statistics<T> stats = columnChunk.getStatistics();
if (stats.isEmpty()) {
// we have no statistics available, we cannot drop any chunks
- return false;
+ return BLOCK_MIGHT_MATCH;
}
if (isAllNulls(columnChunk)) {
- // there is no min max, there is nothing
- // else we can say about this chunk, we
- // cannot drop it.
- return false;
+ // let's run the udp with a null value to see whether it keeps null or not.
+ if (inverted) {
+ return udp.keep(null);
+ } else {
+ return !udp.keep(null);
+ }
}
org.apache.parquet.filter2.predicate.Statistics<T> udpStats =
diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/filter2/statisticslevel/TestStatisticsFilter.java b/parquet-hadoop/src/test/java/org/apache/parquet/filter2/statisticslevel/TestStatisticsFilter.java
index b47ed69..d8b4407 100644
--- a/parquet-hadoop/src/test/java/org/apache/parquet/filter2/statisticslevel/TestStatisticsFilter.java
+++ b/parquet-hadoop/src/test/java/org/apache/parquet/filter2/statisticslevel/TestStatisticsFilter.java
@@ -83,6 +83,7 @@
private static final IntColumn intColumn = intColumn("int.column");
private static final DoubleColumn doubleColumn = doubleColumn("double.column");
private static final BinaryColumn missingColumn = binaryColumn("missing");
+ private static final IntColumn missingColumn2 = intColumn("missing.int");
private static final IntStatistics intStats = new IntStatistics();
private static final IntStatistics nullIntStats = new IntStatistics();
@@ -269,7 +270,10 @@
@Override
public boolean keep(Integer value) {
- throw new RuntimeException("this method should not be called");
+ if (value == null) {
+ return true;
+ }
+ throw new RuntimeException("this method should not be called with value != null");
}
@Override
@@ -283,11 +287,27 @@
}
}
+ public static class DropNullUdp extends SevensAndEightsUdp { // test UDP whose keep() rejects null
+ @Override
+ public boolean keep(Integer value) {
+ if (value == null) {
+ return false; // null is never kept by this predicate
+ }
+ throw new RuntimeException("this method should not be called with value != null"); // any non-null value fails loudly
+ }
+ }
+
@Test
public void testUdp() {
FilterPredicate pred = userDefined(intColumn, SevensAndEightsUdp.class);
FilterPredicate invPred = LogicalInverseRewriter.rewrite(not(userDefined(intColumn, SevensAndEightsUdp.class)));
+ FilterPredicate udpDropMissingColumn = userDefined(missingColumn2, DropNullUdp.class);
+ FilterPredicate invUdpDropMissingColumn = LogicalInverseRewriter.rewrite(not(userDefined(missingColumn2, DropNullUdp.class)));
+
+ FilterPredicate udpKeepMissingColumn = userDefined(missingColumn2, SevensAndEightsUdp.class);
+ FilterPredicate invUdpKeepMissingColumn = LogicalInverseRewriter.rewrite(not(userDefined(missingColumn2, SevensAndEightsUdp.class)));
+
IntStatistics seven = new IntStatistics();
seven.setMinMax(7, 7);
@@ -320,6 +340,58 @@
assertFalse(canDrop(invPred, Arrays.asList(
getIntColumnMeta(neither, 177L),
getDoubleColumnMeta(doubleStats, 177L))));
+
+ // udpDropMissingColumn drops null column.
+ assertTrue(canDrop(udpDropMissingColumn, Arrays.asList(
+ getIntColumnMeta(seven, 177L),
+ getDoubleColumnMeta(doubleStats, 177L))));
+
+ assertTrue(canDrop(udpDropMissingColumn, Arrays.asList(
+ getIntColumnMeta(eight, 177L),
+ getDoubleColumnMeta(doubleStats, 177L))));
+
+ assertTrue(canDrop(udpDropMissingColumn, Arrays.asList(
+ getIntColumnMeta(neither, 177L),
+ getDoubleColumnMeta(doubleStats, 177L))));
+
+ // invUdpDropMissingColumn (i.e., not(udpDropMissingColumn)) keeps null column.
+ assertFalse(canDrop(invUdpDropMissingColumn, Arrays.asList(
+ getIntColumnMeta(seven, 177L),
+ getDoubleColumnMeta(doubleStats, 177L))));
+
+ assertFalse(canDrop(invUdpDropMissingColumn, Arrays.asList(
+ getIntColumnMeta(eight, 177L),
+ getDoubleColumnMeta(doubleStats, 177L))));
+
+ assertFalse(canDrop(invUdpDropMissingColumn, Arrays.asList(
+ getIntColumnMeta(neither, 177L),
+ getDoubleColumnMeta(doubleStats, 177L))));
+
+ // udpKeepMissingColumn keeps null column.
+ assertFalse(canDrop(udpKeepMissingColumn, Arrays.asList(
+ getIntColumnMeta(seven, 177L),
+ getDoubleColumnMeta(doubleStats, 177L))));
+
+ assertFalse(canDrop(udpKeepMissingColumn, Arrays.asList(
+ getIntColumnMeta(eight, 177L),
+ getDoubleColumnMeta(doubleStats, 177L))));
+
+ assertFalse(canDrop(udpKeepMissingColumn, Arrays.asList(
+ getIntColumnMeta(neither, 177L),
+ getDoubleColumnMeta(doubleStats, 177L))));
+
+ // invUdpKeepMissingColumn (i.e., not(udpKeepMissingColumn)) drops null column.
+ assertTrue(canDrop(invUdpKeepMissingColumn, Arrays.asList(
+ getIntColumnMeta(seven, 177L),
+ getDoubleColumnMeta(doubleStats, 177L))));
+
+ assertTrue(canDrop(invUdpKeepMissingColumn, Arrays.asList(
+ getIntColumnMeta(eight, 177L),
+ getDoubleColumnMeta(doubleStats, 177L))));
+
+ assertTrue(canDrop(invUdpKeepMissingColumn, Arrays.asList(
+ getIntColumnMeta(neither, 177L),
+ getDoubleColumnMeta(doubleStats, 177L))));
}
@Test