blob: a7c77b111be414aba84ac6b2d4fcaf2ac33882a9 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.iceberg.orc;
import java.math.BigDecimal;
import java.nio.ByteBuffer;
import java.sql.Date;
import java.sql.Timestamp;
import java.time.Instant;
import java.time.LocalDate;
import java.time.OffsetDateTime;
import java.time.ZoneOffset;
import java.time.format.DateTimeFormatter;
import java.time.temporal.ChronoUnit;
import java.util.TimeZone;
import java.util.UUID;
import org.apache.iceberg.Schema;
import org.apache.iceberg.expressions.Binder;
import org.apache.iceberg.expressions.Expression;
import org.apache.iceberg.mapping.MappingUtil;
import org.apache.iceberg.mapping.NameMapping;
import org.apache.iceberg.types.Types;
import org.apache.orc.TypeDescription;
import org.apache.orc.storage.ql.io.sarg.PredicateLeaf.Type;
import org.apache.orc.storage.ql.io.sarg.SearchArgument;
import org.apache.orc.storage.ql.io.sarg.SearchArgument.TruthValue;
import org.apache.orc.storage.ql.io.sarg.SearchArgumentFactory;
import org.apache.orc.storage.serde2.io.HiveDecimalWritable;
import org.junit.Assert;
import org.junit.Test;
import static org.apache.iceberg.expressions.Expressions.and;
import static org.apache.iceberg.expressions.Expressions.equal;
import static org.apache.iceberg.expressions.Expressions.greaterThan;
import static org.apache.iceberg.expressions.Expressions.greaterThanOrEqual;
import static org.apache.iceberg.expressions.Expressions.in;
import static org.apache.iceberg.expressions.Expressions.isNaN;
import static org.apache.iceberg.expressions.Expressions.isNull;
import static org.apache.iceberg.expressions.Expressions.lessThan;
import static org.apache.iceberg.expressions.Expressions.lessThanOrEqual;
import static org.apache.iceberg.expressions.Expressions.notEqual;
import static org.apache.iceberg.expressions.Expressions.notIn;
import static org.apache.iceberg.expressions.Expressions.notNaN;
import static org.apache.iceberg.expressions.Expressions.notNull;
import static org.apache.iceberg.types.Types.NestedField.optional;
import static org.apache.iceberg.types.Types.NestedField.required;
public class TestExpressionToSearchArgument {
@Test
public void testPrimitiveTypes() {
Schema schema = new Schema(
required(1, "int", Types.IntegerType.get()),
required(2, "long", Types.LongType.get()),
required(3, "float", Types.FloatType.get()),
required(4, "double", Types.DoubleType.get()),
required(5, "boolean", Types.BooleanType.get()),
required(6, "string", Types.StringType.get()),
required(7, "date", Types.DateType.get()),
required(8, "time", Types.TimeType.get()),
required(9, "tsTz", Types.TimestampType.withZone()),
required(10, "ts", Types.TimestampType.withoutZone()),
required(11, "decimal", Types.DecimalType.of(38, 2)),
required(12, "float2", Types.FloatType.get()),
required(13, "double2", Types.DoubleType.get())
);
Expression expr = and(
and(
and(lessThan("int", 1), lessThanOrEqual("long", 100)),
and(greaterThan("float", 5.0), greaterThanOrEqual("double", 500.0))
),
and(
and(equal("boolean", true), notEqual("string", "test")),
and(in("decimal", BigDecimal.valueOf(-12345, 2), BigDecimal.valueOf(12345, 2)), notIn("time", 100L, 200L))
),
and(isNaN("float2"), notNaN("double2"))
);
Expression boundFilter = Binder.bind(schema.asStruct(), expr, true);
SearchArgument expected = SearchArgumentFactory.newBuilder()
.startAnd()
.lessThan("`int`", Type.LONG, 1L)
.lessThanEquals("`long`", Type.LONG, 100L)
.startNot().lessThanEquals("`float`", Type.FLOAT, 5.0).end()
.startNot().lessThan("`double`", Type.FLOAT, 500.0).end()
.equals("`boolean`", Type.BOOLEAN, true)
.startOr().isNull("`string`", Type.STRING).startNot().equals("`string`", Type.STRING, "test").end().end()
.in("`decimal`", Type.DECIMAL, new HiveDecimalWritable("-123.45"), new HiveDecimalWritable("123.45"))
.startOr().isNull("`time`", Type.LONG).startNot().in("`time`", Type.LONG, 100L, 200L).end().end()
.equals("`float2`", Type.FLOAT, Double.NaN)
.startOr().isNull("`double2`", Type.FLOAT).startNot().equals("`double2`", Type.FLOAT, Double.NaN).end().end()
.end()
.build();
SearchArgument actual = ExpressionToSearchArgument.convert(boundFilter, ORCSchemaUtil.convert(schema));
Assert.assertEquals(expected.toString(), actual.toString());
}
@Test
public void testTimezoneSensitiveTypes() {
TimeZone currentTz = TimeZone.getDefault();
try {
for (String timezone : new String[]{"America/New_York", "Asia/Kolkata", "UTC/Greenwich"}) {
TimeZone.setDefault(TimeZone.getTimeZone(timezone));
OffsetDateTime tsTzPredicate = OffsetDateTime.parse("2019-10-02T00:47:28.207366Z");
OffsetDateTime tsPredicate = OffsetDateTime.parse("1968-01-16T13:07:59.048625Z");
OffsetDateTime epoch = Instant.ofEpochSecond(0).atOffset(ZoneOffset.UTC);
Schema schema = new Schema(
required(1, "date", Types.DateType.get()),
required(2, "tsTz", Types.TimestampType.withZone()),
required(3, "ts", Types.TimestampType.withoutZone())
);
Expression expr = and(
and(equal("date", 10L), equal("tsTz", ChronoUnit.MICROS.between(epoch, tsTzPredicate))),
equal("ts", ChronoUnit.MICROS.between(epoch, tsPredicate))
);
Expression boundFilter = Binder.bind(schema.asStruct(), expr, true);
SearchArgument expected = SearchArgumentFactory.newBuilder()
.startAnd()
.equals("`date`", Type.DATE, Date.valueOf(LocalDate.parse("1970-01-11", DateTimeFormatter.ISO_LOCAL_DATE)))
.equals("`tsTz`", Type.TIMESTAMP, Timestamp.from(tsTzPredicate.toInstant()))
.equals("`ts`", Type.TIMESTAMP, Timestamp.from(tsPredicate.toInstant()))
.end()
.build();
SearchArgument actual = ExpressionToSearchArgument.convert(boundFilter, ORCSchemaUtil.convert(schema));
Assert.assertEquals(expected.toString(), actual.toString());
}
} finally {
TimeZone.setDefault(currentTz);
}
}
@Test
public void testUnsupportedTypes() {
Schema schema = new Schema(
required(1, "binary", Types.BinaryType.get()),
required(2, "fixed", Types.FixedType.ofLength(5)),
required(3, "uuid", Types.UUIDType.get()),
// use optional fields for performing isNull checks because Iceberg itself resolves them for required fields
optional(4, "struct", Types.StructType.of(
required(5, "long", Types.LongType.get())
)),
optional(6, "list", Types.ListType.ofRequired(7, Types.LongType.get())),
optional(8, "map", Types.MapType.ofRequired(9, 10, Types.LongType.get(), Types.LongType.get()))
);
// all operations for these types should resolve to YES_NO_NULL
Expression expr = and(
and(
and(equal("binary", ByteBuffer.allocate(10)), notEqual("fixed", ByteBuffer.allocate(5))),
and(greaterThan("uuid", UUID.fromString("1-2-3-4-5")), isNull("struct"))
),
and(notNull("list"), isNull("map"))
);
Expression boundFilter = Binder.bind(schema.asStruct(), expr, true);
SearchArgument expected = SearchArgumentFactory.newBuilder()
.literal(TruthValue.YES_NO_NULL)
.build();
SearchArgument actual = ExpressionToSearchArgument.convert(boundFilter, ORCSchemaUtil.convert(schema));
Assert.assertEquals(expected.toString(), actual.toString());
}
@Test
public void testNestedPrimitives() {
Schema schema = new Schema(
optional(1, "struct", Types.StructType.of(
required(2, "long", Types.LongType.get()),
required(11, "float", Types.FloatType.get())
)),
optional(3, "list", Types.ListType.ofRequired(4, Types.LongType.get())),
optional(5, "map", Types.MapType.ofRequired(6, 7, Types.LongType.get(), Types.DoubleType.get())),
optional(8, "listOfStruct", Types.ListType.ofRequired(9, Types.StructType.of(
required(10, "long", Types.LongType.get()))))
);
Expression expr = and(
and(equal("struct.long", 1), equal("list.element", 2)),
and(equal("map.key", 3), equal("listOfStruct.long", 4)),
and(isNaN("map.value"), notNaN("struct.float"))
);
Expression boundFilter = Binder.bind(schema.asStruct(), expr, true);
SearchArgument expected = SearchArgumentFactory.newBuilder()
.startAnd()
.equals("`struct`.`long`", Type.LONG, 1L)
.equals("`list`.`_elem`", Type.LONG, 2L)
.equals("`map`.`_key`", Type.LONG, 3L)
.equals("`listOfStruct`.`_elem`.`long`", Type.LONG, 4L)
.equals("`map`.`_value`", Type.FLOAT, Double.NaN)
.startOr()
.isNull("`struct`.`float`", Type.FLOAT)
.startNot().equals("`struct`.`float`", Type.FLOAT, Double.NaN)
.end() // not
.end() // or
.end() // and
.build();
SearchArgument actual = ExpressionToSearchArgument.convert(boundFilter, ORCSchemaUtil.convert(schema));
Assert.assertEquals(expected.toString(), actual.toString());
}
@Test
public void testSpecialCharacters() {
Schema schema = new Schema(
required(1, "col.with.dots", Types.StructType.of(
required(2, "inner.col.with.dots", Types.LongType.get())
)),
required(3, "colW!th$peci@lCh@rs", Types.LongType.get()),
required(4, "colWith`Quotes`", Types.LongType.get())
);
Expression expr = and(
equal("col.with.dots.inner.col.with.dots", 1),
and(equal("colW!th$peci@lCh@rs", 2), equal("colWith`Quotes`", 3))
);
Expression boundFilter = Binder.bind(schema.asStruct(), expr, true);
SearchArgument expected = SearchArgumentFactory.newBuilder()
.startAnd()
.equals("`col.with.dots`.`inner.col.with.dots`", Type.LONG, 1L)
.equals("`colW!th$peci@lCh@rs`", Type.LONG, 2L)
.equals("`colWith``Quotes```", Type.LONG, 3L)
.end()
.build();
SearchArgument actual = ExpressionToSearchArgument.convert(boundFilter, ORCSchemaUtil.convert(schema));
Assert.assertEquals(expected.toString(), actual.toString());
}
@Test
public void testEvolvedSchema() {
Schema fileSchema = new Schema(
required(1, "int", Types.IntegerType.get()),
optional(2, "long_to_be_dropped", Types.LongType.get())
);
Schema evolvedSchema = new Schema(
required(1, "int_renamed", Types.IntegerType.get()),
optional(3, "float_added", Types.FloatType.get())
);
TypeDescription readSchema = ORCSchemaUtil.buildOrcProjection(evolvedSchema, ORCSchemaUtil.convert(fileSchema));
Expression expr = equal("int_renamed", 1);
Expression boundFilter = Binder.bind(evolvedSchema.asStruct(), expr, true);
SearchArgument expected = SearchArgumentFactory.newBuilder()
.equals("`int`", Type.LONG, 1L)
.build();
SearchArgument actual = ExpressionToSearchArgument.convert(boundFilter, readSchema);
Assert.assertEquals(expected.toString(), actual.toString());
// for columns not in the file, buildOrcProjection will append field names with _r<ID>
// this will be passed down to ORC, but ORC will handle such cases and return a TruthValue during evaluation
expr = equal("float_added", 1);
boundFilter = Binder.bind(evolvedSchema.asStruct(), expr, true);
expected = SearchArgumentFactory.newBuilder()
.equals("`float_added_r3`", Type.FLOAT, 1.0)
.build();
actual = ExpressionToSearchArgument.convert(boundFilter, readSchema);
Assert.assertEquals(expected.toString(), actual.toString());
}
@Test
public void testOriginalSchemaNameMapping() {
Schema originalSchema = new Schema(
required(1, "int", Types.IntegerType.get()),
optional(2, "long", Types.LongType.get())
);
TypeDescription orcSchemaWithoutIds = ORCSchemaUtil.removeIds(ORCSchemaUtil.convert(originalSchema));
NameMapping nameMapping = MappingUtil.create(originalSchema);
TypeDescription readSchema = ORCSchemaUtil.buildOrcProjection(originalSchema,
ORCSchemaUtil.applyNameMapping(orcSchemaWithoutIds, nameMapping));
Expression expr = and(equal("int", 1), equal("long", 1));
Expression boundFilter = Binder.bind(originalSchema.asStruct(), expr, true);
SearchArgument expected = SearchArgumentFactory.newBuilder()
.equals("`int`", Type.LONG, 1L)
.equals("`long`", Type.LONG, 1L)
.build();
SearchArgument actual = ExpressionToSearchArgument.convert(boundFilter, readSchema);
Assert.assertEquals(expected.toString(), actual.toString());
}
@Test
public void testModifiedSimpleSchemaNameMapping() {
Schema originalSchema = new Schema(
required(1, "int", Types.IntegerType.get()),
optional(2, "long_to_be_dropped", Types.LongType.get())
);
Schema mappingSchema = new Schema(
required(1, "int", Types.IntegerType.get()),
optional(3, "new_float_field", Types.FloatType.get())
);
TypeDescription orcSchemaWithoutIds = ORCSchemaUtil.removeIds(ORCSchemaUtil.convert(originalSchema));
NameMapping nameMapping = MappingUtil.create(mappingSchema);
TypeDescription readSchema = ORCSchemaUtil.buildOrcProjection(mappingSchema,
ORCSchemaUtil.applyNameMapping(orcSchemaWithoutIds, nameMapping));
Expression expr = equal("int", 1);
Expression boundFilter = Binder.bind(mappingSchema.asStruct(), expr, true);
SearchArgument expected = SearchArgumentFactory.newBuilder()
.equals("`int`", Type.LONG, 1L)
.build();
SearchArgument actual = ExpressionToSearchArgument.convert(boundFilter, readSchema);
Assert.assertEquals(expected.toString(), actual.toString());
// for columns not in the file, buildOrcProjection will append field names with _r<ID>
// this will be passed down to ORC, but ORC will handle such cases and return a TruthValue during evaluation
expr = equal("new_float_field", 1);
boundFilter = Binder.bind(mappingSchema.asStruct(), expr, true);
expected = SearchArgumentFactory.newBuilder()
.equals("`new_float_field_r3`", Type.FLOAT, 1.0)
.build();
actual = ExpressionToSearchArgument.convert(boundFilter, readSchema);
Assert.assertEquals(expected.toString(), actual.toString());
}
@Test
public void testModifiedComplexSchemaNameMapping() {
Schema originalSchema = new Schema(
optional(1, "struct", Types.StructType.of(
required(2, "long", Types.LongType.get())
)),
optional(3, "list", Types.ListType.ofRequired(4, Types.LongType.get())),
optional(5, "map", Types.MapType.ofRequired(6, 7, Types.LongType.get(), Types.LongType.get())),
optional(8, "listOfStruct", Types.ListType.ofRequired(9, Types.StructType.of(
required(10, "long", Types.LongType.get())))),
optional(11, "listOfPeople", Types.ListType.ofRequired(12, Types.StructType.of(
required(13, "name", Types.StringType.get()),
required(14, "birth_date", Types.DateType.get()))))
);
Schema mappingSchema = new Schema(
optional(1, "struct", Types.StructType.of(
required(2, "int", Types.LongType.get())
)),
optional(3, "list", Types.ListType.ofRequired(4, Types.LongType.get())),
optional(5, "newMap", Types.MapType.ofRequired(6, 7, Types.StringType.get(), Types.LongType.get())),
optional(8, "listOfStruct", Types.ListType.ofRequired(9, Types.StructType.of(
required(10, "newLong", Types.LongType.get())))),
optional(11, "listOfPeople", Types.ListType.ofRequired(12, Types.StructType.of(
required(13, "name", Types.StringType.get()),
required(14, "age", Types.IntegerType.get()))))
);
TypeDescription orcSchemaWithoutIds = ORCSchemaUtil.removeIds(ORCSchemaUtil.convert(originalSchema));
NameMapping nameMapping = MappingUtil.create(mappingSchema);
TypeDescription readSchema = ORCSchemaUtil.buildOrcProjection(mappingSchema,
ORCSchemaUtil.applyNameMapping(orcSchemaWithoutIds, nameMapping));
Expression expr = and(
and(
equal("struct.int", 1), and(
lessThanOrEqual("list.element", 5),
equal("newMap.key", "country")
),
and(
equal("listOfStruct.newLong", 100L),
notEqual("listOfPeople.name", "Bob")
)
),
lessThan("listOfPeople.age", 30)
);
Expression boundFilter = Binder.bind(mappingSchema.asStruct(), expr, true);
SearchArgument expected = SearchArgumentFactory.newBuilder()
.startAnd()
// Drops struct.long
.equals("`struct`.`int_r2`", Type.LONG, 1L)
.lessThanEquals("`list`.`_elem`", Type.LONG, 5L)
// Drops map
.equals("`newMap_r5`.`_key`", Type.STRING, "country")
// Drops listOfStruct.long
.equals("`listOfStruct`.`_elem`.`newLong_r10`", Type.LONG, 100L)
.startOr()
.isNull("`listOfPeople`.`_elem`.`name`", Type.STRING)
.startNot().equals("`listOfPeople`.`_elem`.`name`", Type.STRING, "Bob").end()
.end()
.lessThan("`listOfPeople`.`_elem`.`age_r14`", Type.LONG, 30L)
.end()
.build();
SearchArgument actual = ExpressionToSearchArgument.convert(boundFilter, readSchema);
Assert.assertEquals(expected.toString(), actual.toString());
}
}