blob: 7b15099105c0a113770bc427487560c99fab6225 [file] [log] [blame]
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# pylint:disable=redefined-outer-name
from typing import Any
import pytest
from pyiceberg.conversions import to_bytes
from pyiceberg.expressions import (
And,
EqualTo,
GreaterThan,
GreaterThanOrEqual,
In,
IsNaN,
IsNull,
LessThan,
LessThanOrEqual,
Not,
NotEqualTo,
NotIn,
NotNaN,
NotNull,
NotStartsWith,
Or,
StartsWith,
)
from pyiceberg.expressions.visitors import _InclusiveMetricsEvaluator, _StrictMetricsEvaluator
from pyiceberg.manifest import DataFile, FileFormat
from pyiceberg.schema import Schema
from pyiceberg.typedef import Record
from pyiceberg.types import (
DoubleType,
FloatType,
IcebergType,
IntegerType,
NestedField,
PrimitiveType,
StringType,
)
# Lower and upper bound values used for the integer "id" column statistics.
INT_MIN_VALUE, INT_MAX_VALUE = 30, 79
def _to_byte_buffer(field_type: IcebergType, val: Any) -> bytes:
    """Serialize ``val`` via ``to_bytes`` for a primitive Iceberg type.

    Raises:
        ValueError: If ``field_type`` is not a ``PrimitiveType``.
    """
    if isinstance(field_type, PrimitiveType):
        return to_bytes(field_type, val)
    raise ValueError(f"Expected a PrimitiveType, got: {type(field_type)}")
# Pre-serialized integer and string bound values.
INT_MIN, INT_MAX = (_to_byte_buffer(IntegerType(), bound) for bound in (INT_MIN_VALUE, INT_MAX_VALUE))
STRING_MIN, STRING_MAX = (_to_byte_buffer(StringType(), bound) for bound in ("a", "z"))
@pytest.fixture
def schema_data_file() -> Schema:
    """Schema with an integer id column plus columns for null, NaN and empty-string scenarios."""
    # Each entry is (field id, column name, column type, required flag).
    column_specs = [
        (1, "id", IntegerType(), True),
        (2, "no_stats", IntegerType(), False),
        (3, "required", StringType(), True),
        (4, "all_nulls", StringType(), False),
        (5, "some_nulls", StringType(), False),
        (6, "no_nulls", StringType(), False),
        (7, "all_nans", DoubleType(), False),
        (8, "some_nans", FloatType(), False),
        (9, "no_nans", FloatType(), False),
        (10, "all_nulls_double", DoubleType(), False),
        (11, "all_nans_v1_stats", FloatType(), False),
        (12, "nan_and_null_only", DoubleType(), False),
        (13, "no_nan_stats", DoubleType(), False),
        (14, "some_empty", StringType(), False),
    ]
    fields = [
        NestedField(field_id, name, field_type, required=is_required)
        for field_id, name, field_type, is_required in column_specs
    ]
    return Schema(*fields)
@pytest.fixture
def data_file() -> DataFile:
    """Parquet data file with 50 records and per-column count/bound metrics for ``schema_data_file``."""
    # Field ids 4 through 14 each report 50 values.
    value_counts = dict.fromkeys(range(4, 15), 50)
    null_value_counts = {4: 50, 5: 10, 6: 0, 10: 50, 11: 0, 12: 1, 14: 8}
    nan_value_counts = {7: 50, 8: 10, 9: 0}

    lower_bounds = {
        1: to_bytes(IntegerType(), INT_MIN_VALUE),
        11: to_bytes(FloatType(), float("nan")),
        12: to_bytes(DoubleType(), float("nan")),
        14: to_bytes(StringType(), ""),
    }
    upper_bounds = {
        1: to_bytes(IntegerType(), INT_MAX_VALUE),
        11: to_bytes(FloatType(), float("nan")),
        12: to_bytes(DoubleType(), float("nan")),
        14: to_bytes(StringType(), "房东整租霍营小区二层两居室"),
    }

    return DataFile.from_args(
        file_path="file_1.parquet",
        file_format=FileFormat.PARQUET,
        partition={},
        record_count=50,
        file_size_in_bytes=3,
        value_counts=value_counts,
        null_value_counts=null_value_counts,
        nan_value_counts=nan_value_counts,
        lower_bounds=lower_bounds,
        upper_bounds=upper_bounds,
    )
@pytest.fixture
def data_file_2() -> DataFile:
    """Parquet data file whose field 3 string bounds are "aa" and "dC"."""
    string_type = StringType()
    lower = to_bytes(string_type, "aa")
    upper = to_bytes(string_type, "dC")
    return DataFile.from_args(
        file_path="file_2.parquet",
        file_format=FileFormat.PARQUET,
        partition={},
        record_count=50,
        file_size_in_bytes=3,
        value_counts={3: 20},
        null_value_counts={3: 2},
        nan_value_counts=None,
        lower_bounds={3: lower},
        upper_bounds={3: upper},
    )
@pytest.fixture
def data_file_3() -> DataFile:
    """Parquet data file whose field 3 string bounds are "1str1" and "3str3"."""
    string_type = StringType()
    lower = to_bytes(string_type, "1str1")
    upper = to_bytes(string_type, "3str3")
    return DataFile.from_args(
        file_path="file_3.parquet",
        file_format=FileFormat.PARQUET,
        partition={},
        record_count=50,
        file_size_in_bytes=3,
        value_counts={3: 20},
        null_value_counts={3: 2},
        nan_value_counts=None,
        lower_bounds={3: lower},
        upper_bounds={3: upper},
    )
@pytest.fixture
def data_file_4() -> DataFile:
    """Parquet data file whose field 3 string bounds are "abc" and a non-ASCII upper bound."""
    string_type = StringType()
    lower = to_bytes(string_type, "abc")
    upper = to_bytes(string_type, "イロハニホヘト")
    return DataFile.from_args(
        file_path="file_4.parquet",
        file_format=FileFormat.PARQUET,
        partition={},
        record_count=50,
        file_size_in_bytes=3,
        value_counts={3: 20},
        null_value_counts={3: 2},
        nan_value_counts=None,
        lower_bounds={3: lower},
        upper_bounds={3: upper},
    )
def test_all_null(schema_data_file: Schema, data_file: DataFile) -> None:
    """Inclusive evaluation of predicates on the all_nulls, some_nulls and no_nulls columns."""

    def reads(expr: Any) -> bool:
        # Evaluate one expression against the shared schema and data file.
        return _InclusiveMetricsEvaluator(schema_data_file, expr).eval(data_file)

    assert not reads(NotNull("all_nulls")), "Should skip: no non-null value in all null column"
    assert not reads(LessThan("all_nulls", "a")), "Should skip: lessThan on all null column"
    assert not reads(LessThanOrEqual("all_nulls", "a")), "Should skip: lessThanOrEqual on all null column"
    assert not reads(GreaterThan("all_nulls", "a")), "Should skip: greaterThan on all null column"
    assert not reads(GreaterThanOrEqual("all_nulls", "a")), "Should skip: greaterThanOrEqual on all null column"
    assert not reads(EqualTo("all_nulls", "a")), "Should skip: equal on all null column"

    assert reads(NotNull("some_nulls")), "Should read: column with some nulls contains a non-null value"
    assert reads(NotNull("no_nulls")), "Should read: non-null column contains a non-null value"

    assert not reads(StartsWith("all_nulls", "asad")), "Should skip: startsWith on all null column"
    assert reads(NotStartsWith("all_nulls", "asad")), "Should read: notStartsWith on all null column"
def test_no_nulls(schema_data_file: Schema, data_file: DataFile) -> None:
    """Inclusive evaluation of IsNull on the all_nulls, some_nulls and no_nulls columns."""
    # Each case is (column, expected should-read result, assertion message).
    cases = (
        ("all_nulls", True, "Should read: at least one null value in all null column"),
        ("some_nulls", True, "Should read: column with some nulls contains a null value"),
        ("no_nulls", False, "Should skip: non-null column contains no null values"),
    )
    for column, expected, message in cases:
        outcome = _InclusiveMetricsEvaluator(schema_data_file, IsNull(column)).eval(data_file)
        assert bool(outcome) is expected, message
def test_is_nan(schema_data_file: Schema, data_file: DataFile) -> None:
    """Inclusive evaluation of IsNaN across the floating-point columns of the fixture file."""
    # Each case is (column, expected should-read result, assertion message).
    cases = (
        ("all_nans", True, "Should read: at least one nan value in all nan column"),
        ("some_nans", True, "Should read: at least one nan value in some nan column"),
        ("no_nans", False, "Should skip: no-nans column contains no nan values"),
        ("all_nulls_double", False, "Should skip: all-null column doesn't contain nan value"),
        ("no_nan_stats", True, "Should read: no guarantee on if contains nan value without nan stats"),
        ("all_nans_v1_stats", True, "Should read: at least one nan value in all nan column"),
        ("nan_and_null_only", True, "Should read: at least one nan value in nan and nulls only column"),
    )
    for column, expected, message in cases:
        outcome = _InclusiveMetricsEvaluator(schema_data_file, IsNaN(column)).eval(data_file)
        assert bool(outcome) is expected, message
def test_not_nan(schema_data_file: Schema, data_file: DataFile) -> None:
    """Inclusive evaluation of NotNaN across the floating-point columns of the fixture file."""
    # Each case is (column, expected should-read result, assertion message).
    cases = (
        ("all_nans", False, "Should skip: column with all nans will not contain non-nan"),
        ("some_nans", True, "Should read: at least one non-nan value in some nan column"),
        ("no_nans", True, "Should read: at least one non-nan value in no nan column"),
        ("all_nulls_double", True, "Should read: at least one non-nan value in all null column"),
        ("no_nan_stats", True, "Should read: no guarantee on if contains nan value without nan stats"),
        ("all_nans_v1_stats", True, "Should read: no guarantee on if contains nan value without nan stats"),
        ("nan_and_null_only", True, "Should read: at least one null value in nan and nulls only column"),
    )
    for column, expected, message in cases:
        outcome = _InclusiveMetricsEvaluator(schema_data_file, NotNaN(column)).eval(data_file)
        assert bool(outcome) is expected, message
def test_required_column(schema_data_file: Schema, data_file: DataFile) -> None:
    """NotNull on the required column should read; IsNull on it should skip."""
    not_null_result = _InclusiveMetricsEvaluator(schema_data_file, NotNull("required")).eval(data_file)
    assert not_null_result, "Should read: required columns are always non-null"

    is_null_result = _InclusiveMetricsEvaluator(schema_data_file, IsNull("required")).eval(data_file)
    assert not is_null_result, "Should skip: required columns are always non-null"
def test_missing_column(schema_data_file: Schema, data_file: DataFile) -> None:
    """Referencing a column that is absent from the schema raises ValueError."""
    unknown_column = LessThan("missing", 22)
    with pytest.raises(ValueError) as raised:
        _InclusiveMetricsEvaluator(schema_data_file, unknown_column).eval(data_file)

    expected_message = "Could not find field with name missing, case_sensitive=True"
    assert str(raised.value) == expected_message
def test_missing_stats() -> None:
    """Every predicate should read a file that carries no column statistics."""
    schema_without_stats = Schema(NestedField(2, "no_stats", DoubleType(), required=False))

    file_without_stats = DataFile.from_args(
        file_path="file_1.parquet",
        file_format=FileFormat.PARQUET,
        partition=Record(),
        record_count=50,
        value_counts=None,
        null_value_counts=None,
        nan_value_counts=None,
        lower_bounds=None,
        upper_bounds=None,
    )

    column = "no_stats"
    predicates = (
        LessThan(column, 5),
        LessThanOrEqual(column, 30),
        EqualTo(column, 70),
        GreaterThan(column, 78),
        GreaterThanOrEqual(column, 90),
        NotEqualTo(column, 101),
        IsNull(column),
        NotNull(column),
        IsNaN(column),
        NotNaN(column),
    )

    for predicate in predicates:
        outcome = _InclusiveMetricsEvaluator(schema_without_stats, predicate).eval(file_without_stats)
        assert outcome, f"Should read when stats are missing for: {predicate}"
def test_zero_record_file_stats(schema_data_file: Schema) -> None:
    """Every predicate should skip a data file whose record count is zero."""
    empty_file = DataFile.from_args(
        file_path="file_1.parquet",
        file_format=FileFormat.PARQUET,
        partition=Record(),
        record_count=0,
    )

    column = "no_stats"
    predicates = (
        LessThan(column, 5),
        LessThanOrEqual(column, 30),
        EqualTo(column, 70),
        GreaterThan(column, 78),
        GreaterThanOrEqual(column, 90),
        NotEqualTo(column, 101),
        IsNull(column),
        NotNull(column),
        IsNaN(column),
        NotNaN(column),
    )

    for predicate in predicates:
        outcome = _InclusiveMetricsEvaluator(schema_data_file, predicate).eval(empty_file)
        assert not outcome, f"Should skip a datafile without records: {predicate}"
def test_not(schema_data_file: Schema, data_file: DataFile) -> None:
    """Negating a non-matching predicate reads; negating a matching one skips."""
    below_range = INT_MIN_VALUE - 25

    negated_false = Not(LessThan("id", below_range))
    assert _InclusiveMetricsEvaluator(schema_data_file, negated_false).eval(data_file), "Should read: not(false)"

    negated_true = Not(GreaterThan("id", below_range))
    assert not _InclusiveMetricsEvaluator(schema_data_file, negated_true).eval(data_file), "Should skip: not(true)"
def test_and(schema_data_file: Schema, data_file: DataFile) -> None:
    """Check And() pruning for (false, true), (false, false) and (true, true) operand pairs.

    The id column has bounds [INT_MIN_VALUE, INT_MAX_VALUE], so a predicate is
    "false" when it cannot match any value in that range.
    """

    def reads(expr: Any) -> bool:
        # Evaluate one expression against the shared schema and data file.
        return _InclusiveMetricsEvaluator(schema_data_file, expr).eval(data_file)

    assert not reads(
        And(LessThan("id", INT_MIN_VALUE - 25), GreaterThanOrEqual("id", INT_MIN_VALUE - 30))
    ), "Should skip: and(false, true)"

    # The right operand must exceed the upper bound to be genuinely false;
    # INT_MIN_VALUE + 1 lies inside the range and would make this and(false, true) again.
    assert not reads(
        And(LessThan("id", INT_MIN_VALUE - 25), GreaterThanOrEqual("id", INT_MAX_VALUE + 1))
    ), "Should skip: and(false, false)"

    assert reads(
        And(GreaterThan("id", INT_MIN_VALUE - 25), LessThanOrEqual("id", INT_MIN_VALUE))
    ), "Should read: and(true, true)"
def test_or(schema_data_file: Schema, data_file: DataFile) -> None:
    """Check Or() pruning for (false, false) and (false, true) operand pairs."""

    def reads(expr: Any) -> bool:
        # Evaluate one expression against the shared schema and data file.
        return _InclusiveMetricsEvaluator(schema_data_file, expr).eval(data_file)

    neither_matches = Or(LessThan("id", INT_MIN_VALUE - 25), GreaterThanOrEqual("id", INT_MAX_VALUE + 1))
    assert not reads(neither_matches), "Should skip: or(false, false)"

    right_matches = Or(LessThan("id", INT_MIN_VALUE - 25), GreaterThanOrEqual("id", INT_MAX_VALUE - 19))
    assert reads(right_matches), "Should read: or(false, true)"
def test_integer_lt(schema_data_file: Schema, data_file: DataFile) -> None:
    """Inclusive evaluation of LessThan on the id column around its lower bound."""
    # Each case is (literal, expected should-read result, assertion message).
    cases = (
        (INT_MIN_VALUE - 25, False, "Should not read: id range below lower bound (5 < 30)"),
        (INT_MIN_VALUE, False, "Should not read: id range below lower bound (30 is not < 30)"),
        (INT_MIN_VALUE + 1, True, "Should read: one possible id"),
        (INT_MAX_VALUE, True, "Should read: may possible ids"),
    )
    for literal, expected, message in cases:
        outcome = _InclusiveMetricsEvaluator(schema_data_file, LessThan("id", literal)).eval(data_file)
        assert bool(outcome) is expected, message
def test_integer_lt_eq(schema_data_file: Schema, data_file: DataFile) -> None:
    """Inclusive evaluation of LessThanOrEqual on the id column around its lower bound."""
    # Each case is (literal, expected should-read result, assertion message).
    cases = (
        (INT_MIN_VALUE - 25, False, "Should not read: id range below lower bound (5 < 30)"),
        # Literal is 29 here, so the message states the comparison actually being checked.
        (INT_MIN_VALUE - 1, False, "Should not read: id range below lower bound (29 < 30)"),
        (INT_MIN_VALUE, True, "Should read: one possible id"),
        (INT_MAX_VALUE, True, "Should read: may possible ids"),
    )
    for literal, expected, message in cases:
        outcome = _InclusiveMetricsEvaluator(schema_data_file, LessThanOrEqual("id", literal)).eval(data_file)
        assert bool(outcome) is expected, message
def test_integer_gt(schema_data_file: Schema, data_file: DataFile) -> None:
    """Inclusive evaluation of GreaterThan on the id column around its upper bound."""
    # Each case is (literal, expected should-read result, assertion message).
    cases = (
        # 85 is above the upper bound of 79, so the message reads "85 > 79".
        (INT_MAX_VALUE + 6, False, "Should not read: id range above upper bound (85 > 79)"),
        (INT_MAX_VALUE, False, "Should not read: id range above upper bound (79 is not > 79)"),
        (INT_MIN_VALUE - 1, True, "Should read: one possible id"),
        (INT_MAX_VALUE - 4, True, "Should read: may possible ids"),
    )
    for literal, expected, message in cases:
        outcome = _InclusiveMetricsEvaluator(schema_data_file, GreaterThan("id", literal)).eval(data_file)
        assert bool(outcome) is expected, message
def test_integer_gt_eq(schema_data_file: Schema, data_file: DataFile) -> None:
    """Inclusive evaluation of GreaterThanOrEqual on the id column around its upper bound."""
    # Each case is (literal, expected should-read result, assertion message).
    cases = (
        # 85 is above the upper bound of 79, so the message reads "85 > 79".
        (INT_MAX_VALUE + 6, False, "Should not read: id range above upper bound (85 > 79)"),
        (INT_MAX_VALUE + 1, False, "Should not read: id range above upper bound (80 > 79)"),
        (INT_MAX_VALUE, True, "Should read: one possible id"),
        (INT_MAX_VALUE - 4, True, "Should read: may possible ids"),
    )
    for literal, expected, message in cases:
        outcome = _InclusiveMetricsEvaluator(schema_data_file, GreaterThanOrEqual("id", literal)).eval(data_file)
        assert bool(outcome) is expected, message
def test_integer_eq(schema_data_file: Schema, data_file: DataFile) -> None:
    """Inclusive evaluation of EqualTo on the id column below, inside and above its bounds."""
    # Each case is (literal, expected should-read result, assertion message).
    cases = (
        (INT_MIN_VALUE - 25, False, "Should not read: id below lower bound"),
        (INT_MIN_VALUE - 1, False, "Should not read: id below lower bound"),
        (INT_MIN_VALUE, True, "Should read: id equal to lower bound"),
        (INT_MAX_VALUE - 4, True, "Should read: id between lower and upper bounds"),
        (INT_MAX_VALUE, True, "Should read: id equal to upper bound"),
        (INT_MAX_VALUE + 1, False, "Should not read: id above upper bound"),
        (INT_MAX_VALUE + 6, False, "Should not read: id above upper bound"),
    )
    for literal, expected, message in cases:
        outcome = _InclusiveMetricsEvaluator(schema_data_file, EqualTo("id", literal)).eval(data_file)
        assert bool(outcome) is expected, message
def test_integer_not_eq(schema_data_file: Schema, data_file: DataFile) -> None:
    """NotEqualTo on the id column should read for every literal tried."""
    # Each case is (literal, assertion message); all are expected to read.
    cases = (
        (INT_MIN_VALUE - 25, "Should read: id below lower bound"),
        (INT_MIN_VALUE - 1, "Should read: id below lower bound"),
        (INT_MIN_VALUE, "Should read: id equal to lower bound"),
        (INT_MAX_VALUE - 4, "Should read: id between lower and upper bounds"),
        (INT_MAX_VALUE, "Should read: id equal to upper bound"),
        (INT_MAX_VALUE + 1, "Should read: id above upper bound"),
        (INT_MAX_VALUE + 6, "Should read: id above upper bound"),
    )
    for literal, message in cases:
        outcome = _InclusiveMetricsEvaluator(schema_data_file, NotEqualTo("id", literal)).eval(data_file)
        assert outcome, message
def test_integer_not_eq_rewritten(schema_data_file: Schema, data_file: DataFile) -> None:
    """Not(EqualTo) on the id column should read for every literal tried."""
    # Each case is (literal, assertion message); all are expected to read.
    cases = (
        (INT_MIN_VALUE - 25, "Should read: id below lower bound"),
        (INT_MIN_VALUE - 1, "Should read: id below lower bound"),
        (INT_MIN_VALUE, "Should read: id equal to lower bound"),
        (INT_MAX_VALUE - 4, "Should read: id between lower and upper bounds"),
        (INT_MAX_VALUE, "Should read: id equal to upper bound"),
        (INT_MAX_VALUE + 1, "Should read: id above upper bound"),
        (INT_MAX_VALUE + 6, "Should read: id above upper bound"),
    )
    for literal, message in cases:
        outcome = _InclusiveMetricsEvaluator(schema_data_file, Not(EqualTo("id", literal))).eval(data_file)
        assert outcome, message
def test_integer_case_insensitive_not_eq_rewritten(schema_data_file: Schema, data_file: DataFile) -> None:
    """Not(EqualTo) on "ID" with case_sensitive=False should read for every literal tried."""
    # Each case is (literal, assertion message); all are expected to read.
    cases = (
        (INT_MIN_VALUE - 25, "Should read: id below lower bound"),
        (INT_MIN_VALUE - 1, "Should read: id below lower bound"),
        (INT_MIN_VALUE, "Should read: id equal to lower bound"),
        (INT_MAX_VALUE - 4, "Should read: id between lower and upper bounds"),
        (INT_MAX_VALUE, "Should read: id equal to upper bound"),
        (INT_MAX_VALUE + 1, "Should read: id above upper bound"),
        (INT_MAX_VALUE + 6, "Should read: id above upper bound"),
    )
    for literal, message in cases:
        evaluator = _InclusiveMetricsEvaluator(schema_data_file, Not(EqualTo("ID", literal)), case_sensitive=False)
        assert evaluator.eval(data_file), message
def test_missing_column_case_sensitive(schema_data_file: Schema, data_file: DataFile) -> None:
    """With case_sensitive=True, referencing "ID" raises ValueError."""
    wrong_case = LessThan("ID", 22)
    with pytest.raises(ValueError) as raised:
        _InclusiveMetricsEvaluator(schema_data_file, wrong_case, case_sensitive=True).eval(data_file)

    expected_message = "Could not find field with name ID, case_sensitive=True"
    assert str(raised.value) == expected_message
def test_integer_in(schema_data_file: Schema, data_file: DataFile) -> None:
    """Inclusive evaluation of In on the id column, the null-related string columns and a large list."""

    def reads(expr: Any) -> bool:
        # Evaluate one expression against the shared schema and data file.
        return _InclusiveMetricsEvaluator(schema_data_file, expr).eval(data_file)

    # Each case is (candidate ids, expected should-read result, assertion message).
    id_cases = (
        ({INT_MIN_VALUE - 25, INT_MIN_VALUE - 24}, False, "Should not read: id below lower bound (5 < 30, 6 < 30)"),
        ({INT_MIN_VALUE - 2, INT_MIN_VALUE - 1}, False, "Should not read: id below lower bound (28 < 30, 29 < 30)"),
        ({INT_MIN_VALUE - 1, INT_MIN_VALUE}, True, "Should read: id equal to lower bound (30 == 30)"),
        (
            {INT_MAX_VALUE - 4, INT_MAX_VALUE - 3},
            True,
            "Should read: id between lower and upper bounds (30 < 75 < 79, 30 < 76 < 79)",
        ),
        ({INT_MAX_VALUE, INT_MAX_VALUE + 1}, True, "Should read: id equal to upper bound (79 == 79)"),
        ({INT_MAX_VALUE + 1, INT_MAX_VALUE + 2}, False, "Should not read: id above upper bound (80 > 79, 81 > 79)"),
        ({INT_MAX_VALUE + 6, INT_MAX_VALUE + 7}, False, "Should not read: id above upper bound (85 > 79, 86 > 79)"),
    )
    for candidates, expected, message in id_cases:
        assert bool(reads(In("id", candidates))) is expected, message

    assert not reads(In("all_nulls", {"abc", "def"})), "Should skip: in on all nulls column"
    assert reads(In("some_nulls", {"abc", "def"})), "Should read: in on some nulls column"
    assert reads(In("no_nulls", {"abc", "def"})), "Should read: in on no nulls column"

    many_ids = list(range(400))
    assert reads(In("id", many_ids)), "Should read: large in expression"
def test_integer_not_in(schema_data_file: Schema, data_file: DataFile) -> None:
    """NotIn should read for every id set tried and for all three null-related string columns."""

    def reads(expr: Any) -> bool:
        # Evaluate one expression against the shared schema and data file.
        return _InclusiveMetricsEvaluator(schema_data_file, expr).eval(data_file)

    # Each case is (excluded ids, assertion message); all are expected to read.
    id_cases = (
        ({INT_MIN_VALUE - 25, INT_MIN_VALUE - 24}, "Should read: id below lower bound (5 < 30, 6 < 30)"),
        ({INT_MIN_VALUE - 2, INT_MIN_VALUE - 1}, "Should read: id below lower bound (28 < 30, 29 < 30)"),
        ({INT_MIN_VALUE - 1, INT_MIN_VALUE}, "Should read: id equal to lower bound (30 == 30)"),
        (
            {INT_MAX_VALUE - 4, INT_MAX_VALUE - 3},
            "Should read: id between lower and upper bounds (30 < 75 < 79, 30 < 76 < 79)",
        ),
        ({INT_MAX_VALUE, INT_MAX_VALUE + 1}, "Should read: id equal to upper bound (79 == 79)"),
        ({INT_MAX_VALUE + 1, INT_MAX_VALUE + 2}, "Should read: id above upper bound (80 > 79, 81 > 79)"),
        ({INT_MAX_VALUE + 6, INT_MAX_VALUE + 7}, "Should read: id above upper bound (85 > 79, 86 > 79)"),
    )
    for excluded, message in id_cases:
        assert reads(NotIn("id", excluded)), message

    # Messages name the operator under test (notIn), not its inverse.
    assert reads(NotIn("all_nulls", {"abc", "def"})), "Should read: notIn on all nulls column"
    assert reads(NotIn("some_nulls", {"abc", "def"})), "Should read: notIn on some nulls column"
    assert reads(NotIn("no_nulls", {"abc", "def"})), "Should read: notIn on no nulls column"
@pytest.fixture
def schema_data_file_nan() -> Schema:
    """Schema of floating-point columns used by the NaN-bound evaluator tests."""
    # Each entry is (field id, column name, column type, required flag).
    column_specs = [
        (1, "all_nan", DoubleType(), True),
        (2, "max_nan", DoubleType(), True),
        (3, "min_max_nan", FloatType(), False),
        (4, "all_nan_null_bounds", DoubleType(), True),
        (5, "some_nan_correct_bounds", FloatType(), False),
    ]
    fields = [
        NestedField(field_id, name, field_type, required=is_required)
        for field_id, name, field_type, is_required in column_specs
    ]
    return Schema(*fields)
@pytest.fixture
def data_file_nan() -> DataFile:
    """Data file whose column bounds mix NaN and numeric values for ``schema_data_file_nan``."""
    field_ids = range(1, 6)
    # Separate dict objects per metric so none of them alias each other.
    column_sizes = dict.fromkeys(field_ids, 10)
    value_counts = dict.fromkeys(field_ids, 10)
    null_value_counts = dict.fromkeys(field_ids, 0)

    # Field 4 deliberately has no lower or upper bound entry.
    lower_bounds = {
        1: to_bytes(DoubleType(), float("nan")),
        2: to_bytes(DoubleType(), 7),
        3: to_bytes(FloatType(), float("nan")),
        5: to_bytes(FloatType(), 7),
    }
    upper_bounds = {
        1: to_bytes(DoubleType(), float("nan")),
        2: to_bytes(DoubleType(), float("nan")),
        3: to_bytes(FloatType(), float("nan")),
        5: to_bytes(FloatType(), 22),
    }

    return DataFile.from_args(
        file_path="file.avro",
        file_format=FileFormat.PARQUET,
        partition={},
        record_count=50,
        file_size_in_bytes=3,
        column_sizes=column_sizes,
        value_counts=value_counts,
        null_value_counts=null_value_counts,
        nan_value_counts={1: 10, 4: 10, 5: 5},
        lower_bounds=lower_bounds,
        upper_bounds=upper_bounds,
    )
def test_inclusive_metrics_evaluator_less_than_and_less_than_equal(schema_data_file_nan: Schema, data_file_nan: DataFile) -> None:
    """LessThan and LessThanOrEqual should give the same results against NaN-affected bounds."""

    def reads(expr: Any) -> bool:
        # Evaluate one expression against the NaN schema and data file.
        return _InclusiveMetricsEvaluator(schema_data_file_nan, expr).eval(data_file_nan)

    for predicate in (LessThan, LessThanOrEqual):
        assert not reads(predicate("all_nan", 1)), "Should not match: all nan column doesn't contain number"  # type: ignore[arg-type]
        assert not reads(predicate("max_nan", 1)), "Should not match: 1 is smaller than lower bound"  # type: ignore[arg-type]
        assert reads(predicate("max_nan", 10)), "Should match: 10 is larger than lower bound"  # type: ignore[arg-type]
        assert reads(predicate("min_max_nan", 1)), "Should match: no visibility"  # type: ignore[arg-type]
        assert not reads(predicate("all_nan_null_bounds", 1)), "Should not match: all nan column doesn't contain number"  # type: ignore[arg-type]
        assert not reads(predicate("some_nan_correct_bounds", 1)), "Should not match: 1 is smaller than lower bound"  # type: ignore[arg-type]
        assert reads(predicate("some_nan_correct_bounds", 10)), "Should match: 10 larger than lower bound"  # type: ignore[arg-type]
def test_inclusive_metrics_evaluator_greater_than_and_greater_than_equal(
    schema_data_file_nan: Schema, data_file_nan: DataFile
) -> None:
    """GreaterThan and GreaterThanOrEqual should give the same results against NaN-affected bounds."""

    def reads(expr: Any) -> bool:
        # Evaluate one expression against the NaN schema and data file.
        return _InclusiveMetricsEvaluator(schema_data_file_nan, expr).eval(data_file_nan)

    for predicate in (GreaterThan, GreaterThanOrEqual):
        assert not reads(predicate("all_nan", 1)), "Should not match: all nan column doesn't contain number"  # type: ignore[arg-type]
        assert reads(predicate("max_nan", 1)), "Should match: upper bound is larger than 1"  # type: ignore[arg-type]
        assert reads(predicate("max_nan", 10)), "Should match: upper bound is larger than 10"  # type: ignore[arg-type]
        assert reads(predicate("min_max_nan", 1)), "Should match: no visibility"  # type: ignore[arg-type]
        assert not reads(predicate("all_nan_null_bounds", 1)), "Should not match: all nan column doesn't contain number"  # type: ignore[arg-type]
        assert reads(predicate("some_nan_correct_bounds", 1)), "Should match: 1 is smaller than upper bound"  # type: ignore[arg-type]
        assert reads(predicate("some_nan_correct_bounds", 10)), "Should match: 10 is smaller than upper bound"  # type: ignore[arg-type]
        # Use the column with a real upper bound (22) so the skip comes from the
        # bound comparison; all_nan has a NaN upper bound and is already covered above.
        assert not reads(predicate("some_nan_correct_bounds", 30)), "Should not match: 30 is greater than upper bound"  # type: ignore[arg-type]
def test_inclusive_metrics_evaluator_equals(schema_data_file_nan: Schema, data_file_nan: DataFile) -> None:
    """Check inclusive pruning of ``EqualTo`` predicates on NaN-affected columns.

    The last case targets ``some_nan_correct_bounds`` (not ``all_nan``) so that
    the literal 30 is really checked against that column's upper bound, which
    is what its assertion message describes.
    """
    # (column, literal, expected should_read, assertion message)
    cases = [
        ("all_nan", 1, False, "Should not match: all nan column doesn't contain number"),
        ("max_nan", 1, False, "Should not match: 1 is smaller than lower bound"),
        ("max_nan", 10, True, "Should match: 10 is within bounds"),
        ("min_max_nan", 1, True, "Should match: no visibility"),
        ("all_nan_null_bounds", 1, False, "Should not match: all nan column doesn't contain number"),
        ("some_nan_correct_bounds", 1, False, "Should not match: 1 is smaller than lower bound"),
        ("some_nan_correct_bounds", 10, True, "Should match: 10 is within bounds"),
        ("some_nan_correct_bounds", 30, False, "Should not match: 30 is greater than upper bound"),
    ]

    for column, literal, expected, message in cases:
        should_read = _InclusiveMetricsEvaluator(schema_data_file_nan, EqualTo(column, literal)).eval(data_file_nan)
        assert bool(should_read) is expected, message
def test_inclusive_metrics_evaluator_not_equals(schema_data_file_nan: Schema, data_file_nan: DataFile) -> None:
    """Check that ``NotEqualTo`` never prunes the NaN data file in the inclusive evaluator.

    Every case is expected to be read. ``max_nan`` is exercised with both 1 and
    10; previously the literal 10 was asserted twice and 1 was never covered.
    """
    # (column, literal) pairs; all of them must be reported as "should read".
    cases = [
        ("all_nan", 1),
        ("max_nan", 1),
        ("max_nan", 10),
        ("min_max_nan", 1),
        ("all_nan_null_bounds", 1),
        ("some_nan_correct_bounds", 1),
        ("some_nan_correct_bounds", 10),
        ("some_nan_correct_bounds", 30),
    ]

    for column, literal in cases:
        should_read = _InclusiveMetricsEvaluator(schema_data_file_nan, NotEqualTo(column, literal)).eval(data_file_nan)
        assert should_read, "Should match: no visibility"
def test_inclusive_metrics_evaluator_in(schema_data_file_nan: Schema, data_file_nan: DataFile) -> None:
    """Check inclusive pruning of ``In`` predicates on NaN-affected columns."""
    # (column, literal set, expected should_read, assertion message)
    cases = [
        ("all_nan", (1, 10, 30), False, "Should not match: all nan column doesn't contain number"),
        ("max_nan", (1, 10, 30), True, "Should match: 10 and 30 are greater than lower bound"),
        ("min_max_nan", (1, 10, 30), True, "Should match: no visibility"),
        ("all_nan_null_bounds", (1, 10, 30), False, "Should not match: all nan column doesn't contain number"),
        ("some_nan_correct_bounds", (1, 10, 30), True, "Should match: 10 within bounds"),
        ("some_nan_correct_bounds", (1, 30), False, "Should not match: 1 and 30 not within bounds"),
        ("some_nan_correct_bounds", (5, 7), True, "Should match: overlap with lower bound"),
        ("some_nan_correct_bounds", (22, 25), True, "Should match: overlap with upper bounds"),
    ]

    for column, literals, expected, message in cases:
        outcome = _InclusiveMetricsEvaluator(schema_data_file_nan, In(column, literals)).eval(data_file_nan)
        if expected:
            assert outcome, message
        else:
            assert not outcome, message
def test_inclusive_metrics_evaluator_not_in(schema_data_file_nan: Schema, data_file_nan: DataFile) -> None:
    """Check that ``NotIn`` never prunes the NaN data file in the inclusive evaluator.

    Every case is expected to be read. Each (column, literal set) pair appears
    once; the verbatim duplicates of the ``max_nan`` and
    ``some_nan_correct_bounds`` cases were dropped, and the latter column is
    additionally checked with a set that includes the in-bounds literal 10.
    """
    # (column, literal set) pairs; all of them must be reported as "should read".
    cases = [
        ("all_nan", (1, 10, 30)),
        ("max_nan", (1, 10, 30)),
        ("min_max_nan", (1, 10, 30)),
        ("all_nan_null_bounds", (1, 10, 30)),
        ("some_nan_correct_bounds", (1, 30)),
        ("some_nan_correct_bounds", (1, 10, 30)),
    ]

    for column, literals in cases:
        should_read = _InclusiveMetricsEvaluator(schema_data_file_nan, NotIn(column, literals)).eval(data_file_nan)
        assert should_read, "Should match: no visibility"
def test_string_starts_with(
    schema_data_file: Schema, data_file: DataFile, data_file_2: DataFile, data_file_3: DataFile, data_file_4: DataFile
) -> None:
    """Check inclusive pruning of ``StartsWith`` predicates across the string data-file fixtures."""
    matches = "Should read: range matches"
    no_match = "Should not read: range doesn't match"

    # (column, prefix, data file, expected should_read, assertion message)
    cases = [
        ("required", "a", data_file, True, "Should read: no stats"),
        ("required", "a", data_file_2, True, matches),
        ("required", "aa", data_file_2, True, matches),
        ("required", "aaa", data_file_2, True, matches),
        ("required", "1s", data_file_3, True, matches),
        ("required", "1str1x", data_file_3, True, matches),
        ("required", "ff", data_file_4, True, matches),
        ("required", "aB", data_file_2, False, no_match),
        ("required", "dWX", data_file_2, False, no_match),
        ("required", "5", data_file_3, False, no_match),
        ("required", "3str3x", data_file_3, False, no_match),
        ("some_empty", "房东整租霍", data_file, True, matches),
        ("all_nulls", "", data_file, False, no_match),
    ]

    for column, prefix, target_file, expected, message in cases:
        outcome = _InclusiveMetricsEvaluator(schema_data_file, StartsWith(column, prefix)).eval(target_file)
        if expected:
            assert outcome, message
        else:
            assert not outcome, message

    # Case left disabled from the Java test this mirrors (needs a string-truncation helper):
    # above_max = UnicodeUtil.truncateStringMax(Literal.of("イロハニホヘト"), 4).value().toString();
    # should_read = _InclusiveMetricsEvaluator(schema_data_file, StartsWith("required", above_max)).eval(data_file_4)
    # assert not should_read, "Should not read: range doesn't match"
def test_string_not_starts_with(
    schema_data_file: Schema, data_file: DataFile, data_file_2: DataFile, data_file_3: DataFile, data_file_4: DataFile
) -> None:
    """Check that ``NotStartsWith`` keeps every listed data file in the inclusive evaluator.

    All cases are expected to be read. The messages for the non-matching
    prefixes now say "Should read", in line with what is asserted; previously
    they said "Should not read" while asserting the opposite.
    """
    range_matches = "Should read: range matches"
    range_differs = "Should read: range doesn't match"

    # (prefix, data file, assertion message); every case must be "should read".
    cases = [
        ("a", data_file, "Should read: no stats"),
        ("a", data_file_2, range_matches),
        ("aa", data_file_2, range_matches),
        ("aaa", data_file_2, range_matches),
        ("1s", data_file_3, range_matches),
        ("1str1x", data_file_3, range_matches),
        ("ff", data_file_4, range_matches),
        ("aB", data_file_2, range_differs),
        ("dWX", data_file_2, range_differs),
        ("5", data_file_3, range_differs),
        ("3str3x", data_file_3, range_differs),
    ]

    for prefix, target_file, message in cases:
        should_read = _InclusiveMetricsEvaluator(schema_data_file, NotStartsWith("required", prefix)).eval(target_file)
        assert should_read, message

    # Case left disabled from the Java test this mirrors (needs a string-truncation helper):
    # above_max = UnicodeUtil.truncateStringMax(Literal.of("イロハニホヘト"), 4).value().toString();
    # should_read = _InclusiveMetricsEvaluator(schema_data_file, NotStartsWith("required", above_max)).eval(data_file_4)
    # assert should_read, "Should read: range doesn't match"
@pytest.fixture
def strict_data_file_schema() -> Schema:
    """Schema with 14 fields (ids 1-14) used by the strict metrics evaluator tests."""
    # (field id, name, type, required)
    columns = [
        (1, "id", IntegerType(), True),
        (2, "no_stats", IntegerType(), False),
        (3, "required", StringType(), True),
        (4, "all_nulls", StringType(), False),
        (5, "some_nulls", StringType(), False),
        (6, "no_nulls", StringType(), False),
        (7, "always_5", IntegerType(), False),
        (8, "all_nans", DoubleType(), False),
        (9, "some_nans", FloatType(), False),
        (10, "no_nans", FloatType(), False),
        (11, "all_nulls_double", DoubleType(), False),
        (12, "all_nans_v1_stats", FloatType(), False),
        (13, "nan_and_null_only", DoubleType(), False),
        (14, "no_nan_stats", DoubleType(), False),
    ]
    fields = [NestedField(field_id, name, field_type, required=is_required) for field_id, name, field_type, is_required in columns]
    return Schema(*fields)
@pytest.fixture
def strict_data_file_1() -> DataFile:
    """Data file of 50 records with value/null/NaN counts and integer and NaN bounds."""
    rows = 50
    value_counts = {field_id: rows for field_id in (4, 5, 6, 8, 9, 10, 11, 12, 13, 14)}
    null_value_counts = {4: 50, 5: 10, 6: 0, 11: 50, 12: 0, 13: 1}
    nan_value_counts = {8: 50, 9: 10, 10: 0}

    five = to_bytes(IntegerType(), 5)
    float_nan = to_bytes(FloatType(), float("nan"))
    double_nan = to_bytes(DoubleType(), float("nan"))

    # Field 13 only has a lower bound and field 14 only has an upper bound.
    lower_bounds = {
        1: to_bytes(IntegerType(), INT_MIN_VALUE),
        7: five,
        12: float_nan,
        13: double_nan,
    }
    upper_bounds = {
        1: to_bytes(IntegerType(), INT_MAX_VALUE),
        7: five,
        12: float_nan,
        14: double_nan,
    }

    return DataFile.from_args(
        file_path="file_1.parquet",
        file_format=FileFormat.PARQUET,
        partition={},
        record_count=rows,
        file_size_in_bytes=3,
        value_counts=value_counts,
        null_value_counts=null_value_counts,
        nan_value_counts=nan_value_counts,
        lower_bounds=lower_bounds,
        upper_bounds=upper_bounds,
    )
@pytest.fixture
def strict_data_file_2() -> DataFile:
    """Data file of 50 records with string bounds "bbb".."eee" on field 5 and no NaN counts."""
    rows = 50
    string_type = StringType()
    return DataFile.from_args(
        file_path="file_2.parquet",
        file_format=FileFormat.PARQUET,
        partition={},
        record_count=rows,
        file_size_in_bytes=3,
        value_counts={field_id: rows for field_id in (4, 5, 6, 8)},
        null_value_counts={4: 50, 5: 10, 6: 0},
        nan_value_counts=None,
        lower_bounds={5: to_bytes(string_type, "bbb")},
        upper_bounds={5: to_bytes(string_type, "eee")},
    )
@pytest.fixture
def strict_data_file_3() -> DataFile:
    """Data file of 50 records with string bounds "bbb".."eee" on field 5 and no NaN counts.

    Differs from ``file_2.parquet`` only in its path and in having no value
    count for field 8.
    """
    rows = 50
    string_type = StringType()
    return DataFile.from_args(
        file_path="file_3.parquet",
        file_format=FileFormat.PARQUET,
        partition={},
        record_count=rows,
        file_size_in_bytes=3,
        value_counts={field_id: rows for field_id in (4, 5, 6)},
        null_value_counts={4: 50, 5: 10, 6: 0},
        nan_value_counts=None,
        lower_bounds={5: to_bytes(string_type, "bbb")},
        upper_bounds={5: to_bytes(string_type, "eee")},
    )
def test_strict_all_nulls(strict_data_file_schema: Schema, strict_data_file_1: DataFile) -> None:
    """Strict evaluation of ``NotNull`` and ``NotEqualTo`` against the null counts of file 1."""

    def strict_eval(predicate: Any) -> bool:
        return _StrictMetricsEvaluator(strict_data_file_schema, predicate).eval(strict_data_file_1)

    assert not strict_eval(NotNull("all_nulls")), "Should not match: no non-null value in all null column"
    assert not strict_eval(NotNull("some_nulls")), "Should not match: column with some nulls contains a non-null value"
    assert strict_eval(NotNull("no_nulls")), "Should match: non-null column contains no null values"
    assert strict_eval(NotEqualTo("all_nulls", "a")), "Should match: notEqual on all nulls column"
def test_strict_no_nulls(strict_data_file_schema: Schema, strict_data_file_1: DataFile) -> None:
    """Strict evaluation of ``IsNull`` against the null counts of file 1."""
    # (column, expected result, assertion message)
    cases = [
        ("all_nulls", True, "Should match: all values are null"),
        ("some_nulls", False, "Should not match: not all values are null"),
        ("no_nulls", False, "Should not match: no values are null"),
    ]

    for column, expected, message in cases:
        outcome = _StrictMetricsEvaluator(strict_data_file_schema, IsNull(column)).eval(strict_data_file_1)
        if expected:
            assert outcome, message
        else:
            assert not outcome, message
def test_strict_some_nulls(strict_data_file_schema: Schema, strict_data_file_2: DataFile, strict_data_file_3: DataFile) -> None:
    """Comparisons on ``some_nulls`` must never strictly match, for files 2 and 3."""
    # (predicate, data file, assertion message); none of them may match.
    cases = [
        (LessThan("some_nulls", "ggg"), strict_data_file_2, "Should not match: lessThan on some nulls column"),
        (LessThanOrEqual("some_nulls", "ggg"), strict_data_file_2, "Should not match: lessThanOrEqual on some nulls column"),
        (GreaterThan("some_nulls", "aaa"), strict_data_file_2, "Should not match: greaterThan on some nulls column"),
        (
            GreaterThanOrEqual("some_nulls", "bbb"),
            strict_data_file_2,
            "Should not match: greaterThanOrEqual on some nulls column",
        ),
        (EqualTo("some_nulls", "bbb"), strict_data_file_3, "Should not match: equal on some nulls column"),
    ]

    for predicate, target_file, message in cases:
        outcome = _StrictMetricsEvaluator(strict_data_file_schema, predicate).eval(target_file)
        assert not outcome, message
def test_strict_is_nan(strict_data_file_schema: Schema, strict_data_file_1: DataFile) -> None:
    """Strict evaluation of ``IsNaN`` against the NaN/null metrics of file 1."""
    # (column, expected result, assertion message)
    cases = [
        ("all_nans", True, "Should match: all values are nan"),
        ("some_nans", False, "Should not match: at least one non-nan value in some nan column"),
        ("no_nans", False, "Should not match: at least one non-nan value in no nan column"),
        ("all_nulls_double", False, "Should not match: at least one non-nan value in all null column"),
        ("no_nan_stats", False, "Should not match: cannot determine without nan stats"),
        ("all_nans_v1_stats", False, "Should not match: cannot determine without nan stats"),
        ("nan_and_null_only", False, "Should not match: null values are not nan"),
    ]

    for column, expected, message in cases:
        outcome = _StrictMetricsEvaluator(strict_data_file_schema, IsNaN(column)).eval(strict_data_file_1)
        if expected:
            assert outcome, message
        else:
            assert not outcome, message
def test_strict_not_nan(strict_data_file_schema: Schema, strict_data_file_1: DataFile) -> None:
    """Strict evaluation of ``NotNaN`` against the NaN/null metrics of file 1."""
    # (column, expected result, assertion message)
    cases = [
        ("all_nans", False, "Should not match: all values are nan"),
        ("some_nans", False, "Should not match: at least one nan value in some nan column"),
        ("no_nans", True, "Should match: no value is nan"),
        ("all_nulls_double", True, "Should match: no nan value in all null column"),
        ("no_nan_stats", False, "Should not match: cannot determine without nan stats"),
        ("all_nans_v1_stats", False, "Should not match: all values are nan"),
        ("nan_and_null_only", False, "Should not match: null values are not nan"),
    ]

    for column, expected, message in cases:
        outcome = _StrictMetricsEvaluator(strict_data_file_schema, NotNaN(column)).eval(strict_data_file_1)
        if expected:
            assert outcome, message
        else:
            assert not outcome, message
def test_strict_required_column(strict_data_file_schema: Schema, strict_data_file_1: DataFile) -> None:
    """Null checks on the ``required`` column of the strict schema."""
    not_null_result = _StrictMetricsEvaluator(strict_data_file_schema, NotNull("required")).eval(strict_data_file_1)
    assert not_null_result, "Should match: required columns are always non-null"

    is_null_result = _StrictMetricsEvaluator(strict_data_file_schema, IsNull("required")).eval(strict_data_file_1)
    assert not is_null_result, "Should not match: required columns never contain null"
def test_strict_missing_column(strict_data_file_schema: Schema, strict_data_file_1: DataFile) -> None:
    """A predicate on a column absent from the schema raises ``ValueError`` with a fixed message."""
    expected_message = "Could not find field with name missing, case_sensitive=True"

    with pytest.raises(ValueError) as raised:
        _StrictMetricsEvaluator(strict_data_file_schema, NotNull("missing")).eval(strict_data_file_1)

    assert str(raised.value) == expected_message
def test_strict_missing_stats(strict_data_file_schema: Schema, strict_data_file_1: DataFile) -> None:
    """No predicate may strictly match a 50-record file whose metrics are all ``None``."""
    no_stats_schema = Schema(NestedField(2, "no_stats", DoubleType(), required=False))

    # Every metrics argument is passed explicitly as None.
    absent_metrics = dict.fromkeys(
        ("value_counts", "null_value_counts", "nan_value_counts", "lower_bounds", "upper_bounds")
    )
    no_stats_file = DataFile.from_args(
        file_path="file_1.parquet",
        file_format=FileFormat.PARQUET,
        partition=Record(),
        record_count=50,
        **absent_metrics,
    )

    predicates = [
        LessThan("no_stats", 5),
        LessThanOrEqual("no_stats", 30),
        EqualTo("no_stats", 70),
        GreaterThan("no_stats", 78),
        GreaterThanOrEqual("no_stats", 90),
        NotEqualTo("no_stats", 101),
        IsNull("no_stats"),
        NotNull("no_stats"),
        IsNaN("no_stats"),
        NotNaN("no_stats"),
    ]

    for expression in predicates:
        outcome = _StrictMetricsEvaluator(no_stats_schema, expression).eval(no_stats_file)
        assert not outcome, f"Should never match when stats are missing for expr: {expression}"
def test_strict_zero_record_file_stats(strict_data_file_schema: Schema) -> None:
    """Every predicate must strictly match a data file that holds zero records."""
    empty_file = DataFile.from_args(
        file_path="file_1.parquet",
        file_format=FileFormat.PARQUET,
        partition=Record(),
        record_count=0,
    )

    column = "no_stats"
    predicates = [
        LessThan(column, 5),
        LessThanOrEqual(column, 30),
        EqualTo(column, 70),
        GreaterThan(column, 78),
        GreaterThanOrEqual(column, 90),
        NotEqualTo(column, 101),
        IsNull(column),
        NotNull(column),
        IsNaN(column),
        NotNaN(column),
    ]

    for expression in predicates:
        outcome = _StrictMetricsEvaluator(strict_data_file_schema, expression).eval(empty_file)
        assert outcome, f"Should always match 0-record file: {expression}"
def test_strict_not(schema_data_file: Schema, strict_data_file_1: DataFile) -> None:
    """Strict evaluation of ``Not`` wrapped around comparisons on ``id``.

    The assertion messages now agree with the asserted outcome; previously
    each message stated the opposite of what was being asserted.
    """
    # The `id` bounds of this file are [INT_MIN_VALUE, INT_MAX_VALUE], so the literal lies below them.
    below_all_ids = INT_MIN_VALUE - 25

    should_read = _StrictMetricsEvaluator(schema_data_file, Not(LessThan("id", below_all_ids))).eval(strict_data_file_1)
    assert should_read, "Should match: not(false)"

    should_read = _StrictMetricsEvaluator(schema_data_file, Not(GreaterThan("id", below_all_ids))).eval(strict_data_file_1)
    assert not should_read, "Should not match: not(true)"
def test_strict_and(schema_data_file: Schema, strict_data_file_1: DataFile) -> None:
    """Strict evaluation of ``And`` over pairs of range predicates on ``id``."""
    # (predicate, expected result, assertion message)
    cases = [
        (
            And(GreaterThan("id", INT_MIN_VALUE - 25), LessThanOrEqual("id", INT_MIN_VALUE)),
            False,
            "Should not match: range may not overlap data",
        ),
        (
            And(LessThan("id", INT_MIN_VALUE - 25), GreaterThanOrEqual("id", INT_MIN_VALUE - 30)),
            False,
            "Should not match: range does not overlap data",
        ),
        (
            And(LessThan("id", INT_MAX_VALUE + 6), GreaterThanOrEqual("id", INT_MIN_VALUE - 30)),
            True,
            "Should match: range includes all data",
        ),
    ]

    for predicate, expected, message in cases:
        outcome = _StrictMetricsEvaluator(schema_data_file, predicate).eval(strict_data_file_1)
        if expected:
            assert outcome, message
        else:
            assert not outcome, message
def test_strict_or(schema_data_file: Schema, strict_data_file_1: DataFile) -> None:
    """Strict evaluation of ``Or`` over pairs of range predicates on ``id``."""
    # The left operand is the same in every case; only the right-hand threshold varies.
    below_range = LessThan("id", INT_MIN_VALUE - 25)

    # (right-hand threshold, expected result, assertion message)
    cases = [
        (INT_MAX_VALUE + 1, False, "Should not match: no matching values"),
        (INT_MAX_VALUE - 19, False, "Should not match: some values do not match"),
        (INT_MIN_VALUE, True, "Should match: all values match >= 30"),
    ]

    for threshold, expected, message in cases:
        predicate = Or(below_range, GreaterThanOrEqual("id", threshold))
        outcome = _StrictMetricsEvaluator(schema_data_file, predicate).eval(strict_data_file_1)
        if expected:
            assert outcome, message
        else:
            assert not outcome, message
def test_strict_integer_lt(strict_data_file_schema: Schema, strict_data_file_1: DataFile) -> None:
    """Strict evaluation of ``LessThan`` on ``id`` around its bounds [30, 79].

    The message for ``id < 31`` now names 31, the smallest value the predicate
    excludes; it previously said 32.
    """
    # (literal, expected result, assertion message)
    cases = [
        (INT_MIN_VALUE, False, "Should not match: always false"),
        (INT_MIN_VALUE + 1, False, "Should not match: 31 and greater not in range"),
        (INT_MAX_VALUE, False, "Should not match: 79 not in range"),
        (INT_MAX_VALUE + 1, True, "Should match: all values in range"),
    ]

    for literal, expected, message in cases:
        should_read = _StrictMetricsEvaluator(strict_data_file_schema, LessThan("id", literal)).eval(strict_data_file_1)
        assert bool(should_read) is expected, message
def test_strict_integer_lt_eq(strict_data_file_schema: Schema, strict_data_file_1: DataFile) -> None:
    """Strict evaluation of ``LessThanOrEqual`` on ``id`` around its bounds."""
    # (literal, expected result, assertion message)
    cases = [
        (INT_MIN_VALUE - 1, False, "Should not match: always false"),
        (INT_MIN_VALUE, False, "Should not match: 31 and greater not in range"),
        (INT_MAX_VALUE, True, "Should match: all values in range"),
        (INT_MAX_VALUE + 1, True, "Should match: all values in range"),
    ]

    for literal, expected, message in cases:
        outcome = _StrictMetricsEvaluator(strict_data_file_schema, LessThanOrEqual("id", literal)).eval(strict_data_file_1)
        if expected:
            assert outcome, message
        else:
            assert not outcome, message
def test_strict_integer_gt(strict_data_file_schema: Schema, strict_data_file_1: DataFile) -> None:
    """Strict evaluation of ``GreaterThan`` on ``id`` around its bounds [30, 79].

    The message for ``id > 78`` now names 78, the largest value the predicate
    excludes; it previously said 77.
    """
    # (literal, expected result, assertion message)
    cases = [
        (INT_MAX_VALUE, False, "Should not match: always false"),
        (INT_MAX_VALUE - 1, False, "Should not match: 78 and less not in range"),
        (INT_MIN_VALUE, False, "Should not match: 30 not in range"),
        (INT_MIN_VALUE - 1, True, "Should match: all values in range"),
    ]

    for literal, expected, message in cases:
        should_read = _StrictMetricsEvaluator(strict_data_file_schema, GreaterThan("id", literal)).eval(strict_data_file_1)
        assert bool(should_read) is expected, message
def test_strict_integer_gt_eq(strict_data_file_schema: Schema, strict_data_file_1: DataFile) -> None:
    """Strict evaluation of ``GreaterThanOrEqual`` on ``id`` around its bounds."""
    # (literal, expected result, assertion message)
    cases = [
        (INT_MAX_VALUE + 1, False, "Should not match: no values in range"),
        (INT_MAX_VALUE, False, "Should not match: 78 and lower are not in range"),
        (INT_MIN_VALUE + 1, False, "Should not match: 30 not in range"),
        (INT_MIN_VALUE, True, "Should match: all values in range"),
    ]

    for literal, expected, message in cases:
        outcome = _StrictMetricsEvaluator(strict_data_file_schema, GreaterThanOrEqual("id", literal)).eval(strict_data_file_1)
        if expected:
            assert outcome, message
        else:
            assert not outcome, message
def test_strict_integer_eq(strict_data_file_schema: Schema, strict_data_file_1: DataFile) -> None:
    """Strict evaluation of ``EqualTo`` on ``id`` and on the constant ``always_5`` column."""
    # (column, literal, expected result, assertion message)
    cases = [
        ("id", INT_MIN_VALUE - 25, False, "Should not match: all values != 5"),
        ("id", INT_MIN_VALUE, False, "Should not match: some values != 30"),
        ("id", INT_MAX_VALUE - 4, False, "Should not match: some values != 75"),
        ("id", INT_MAX_VALUE, False, "Should not match: some values != 79"),
        ("id", INT_MAX_VALUE + 1, False, "Should not match: some values != 80"),
        ("always_5", INT_MIN_VALUE - 25, True, "Should match: all values == 5"),
    ]

    for column, literal, expected, message in cases:
        outcome = _StrictMetricsEvaluator(strict_data_file_schema, EqualTo(column, literal)).eval(strict_data_file_1)
        if expected:
            assert outcome, message
        else:
            assert not outcome, message
def test_strict_integer_not_eq(strict_data_file_schema: Schema, strict_data_file_1: DataFile) -> None:
    """Strict evaluation of ``NotEqualTo`` on ``id`` around its bounds [30, 79].

    The message for the literal ``INT_MIN_VALUE - 1`` now names 29; it
    previously said 39.
    """
    # (literal, expected result, assertion message)
    cases = [
        (INT_MIN_VALUE - 25, True, "Should match: no values == 5"),
        (INT_MIN_VALUE - 1, True, "Should match: no values == 29"),
        (INT_MIN_VALUE, False, "Should not match: some value may be == 30"),
        (INT_MAX_VALUE - 4, False, "Should not match: some value may be == 75"),
        (INT_MAX_VALUE, False, "Should not match: some value may be == 79"),
        (INT_MAX_VALUE + 1, True, "Should match: no values == 80"),
        (INT_MAX_VALUE + 6, True, "Should read: no values == 85"),
    ]

    for literal, expected, message in cases:
        should_read = _StrictMetricsEvaluator(strict_data_file_schema, NotEqualTo("id", literal)).eval(strict_data_file_1)
        assert bool(should_read) is expected, message
def test_strict_integer_not_eq_rewritten(strict_data_file_schema: Schema, strict_data_file_1: DataFile) -> None:
    """Strict evaluation of ``Not(EqualTo(...))`` on ``id`` around its bounds [30, 79].

    The message for the literal ``INT_MIN_VALUE - 1`` now names 29; it
    previously said 39.
    """
    # (literal, expected result, assertion message)
    cases = [
        (INT_MIN_VALUE - 25, True, "Should match: no values == 5"),
        (INT_MIN_VALUE - 1, True, "Should match: no values == 29"),
        (INT_MIN_VALUE, False, "Should not match: some value may be == 30"),
        (INT_MAX_VALUE - 4, False, "Should not match: some value may be == 75"),
        (INT_MAX_VALUE, False, "Should not match: some value may be == 79"),
        (INT_MAX_VALUE + 1, True, "Should match: no values == 80"),
        (INT_MAX_VALUE + 6, True, "Should read: no values == 85"),
    ]

    for literal, expected, message in cases:
        should_read = _StrictMetricsEvaluator(strict_data_file_schema, Not(EqualTo("id", literal))).eval(strict_data_file_1)
        assert bool(should_read) is expected, message
def test_strict_integer_in(strict_data_file_schema: Schema, strict_data_file_1: DataFile) -> None:
    """Strict evaluation of ``In`` on ``id``, ``always_5`` and the string null columns.

    Assertion messages now name the literals actually tested: {29, 30} was
    described as 30/31, {79, 80} as 78/79, and one message had a stray ")".
    """
    # (column, literal set, expected result, assertion message)
    cases = [
        ("id", {INT_MIN_VALUE - 25, INT_MIN_VALUE - 24}, False, "Should not match: all values != 5 and != 6"),
        ("id", {INT_MIN_VALUE - 1, INT_MIN_VALUE}, False, "Should not match: some values != 29 and != 30"),
        ("id", {INT_MAX_VALUE - 4, INT_MAX_VALUE - 3}, False, "Should not match: some values != 75 and != 76"),
        ("id", {INT_MAX_VALUE, INT_MAX_VALUE + 1}, False, "Should not match: some values != 79 and != 80"),
        ("id", {INT_MAX_VALUE + 1, INT_MAX_VALUE + 2}, False, "Should not match: some values != 80 and != 81"),
        ("always_5", {5, 6}, True, "Should match: all values == 5"),
        ("all_nulls", {"abc", "def"}, False, "Should not match: in on all nulls column"),
        ("some_nulls", {"abc", "def"}, False, "Should not match: in on some nulls column"),
        ("no_nulls", {"abc", "def"}, False, "Should not match: no_nulls field does not have bounds"),
    ]

    for column, literals, expected, message in cases:
        should_read = _StrictMetricsEvaluator(strict_data_file_schema, In(column, literals)).eval(strict_data_file_1)
        assert bool(should_read) is expected, message
def test_strict_integer_not_in(strict_data_file_schema: Schema, strict_data_file_1: DataFile) -> None:
    """Strict evaluation of ``NotIn`` on ``id``, ``always_5`` and the string null columns."""
    # NOTE(review): the case below is disabled in the original; the reason is not recorded here.
    # should_read = _StrictMetricsEvaluator(strict_data_file_schema, NotIn("id", {INT_MIN_VALUE - 25, INT_MIN_VALUE - 24})).eval(strict_data_file_1)
    # assert should_read, "Should match: all values != 5 and != 6"

    # (column, literal set, expected result, assertion message)
    cases = [
        ("id", {INT_MIN_VALUE - 1, INT_MIN_VALUE}, False, "Should not match: some values may be == 30"),
        ("id", {INT_MAX_VALUE - 4, INT_MAX_VALUE - 3}, False, "Should not match: some value may be == 75 or == 76"),
        ("id", {INT_MAX_VALUE, INT_MAX_VALUE + 1}, False, "Should not match: some value may be == 79"),
        ("id", {INT_MAX_VALUE + 1, INT_MAX_VALUE + 2}, True, "Should match: no values == 80 or == 81"),
        ("always_5", {5, 6}, False, "Should not match: all values == 5"),
        ("all_nulls", {"abc", "def"}, True, "Should match: notIn on all nulls column"),
        (
            "some_nulls",
            {"abc", "def"},
            True,
            "Should match: notIn on some nulls column, 'bbb' > 'abc' and 'bbb' < 'def'",
        ),
        ("no_nulls", {"abc", "def"}, False, "Should not match: no_nulls field does not have bounds"),
    ]

    for column, literals, expected, message in cases:
        outcome = _StrictMetricsEvaluator(strict_data_file_schema, NotIn(column, literals)).eval(strict_data_file_1)
        if expected:
            assert outcome, message
        else:
            assert not outcome, message