# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
# pylint:disable=redefined-outer-name
from typing import Any

import pytest

from pyiceberg.conversions import to_bytes
from pyiceberg.expressions import (
    And,
    EqualTo,
    GreaterThan,
    GreaterThanOrEqual,
    In,
    IsNaN,
    IsNull,
    LessThan,
    LessThanOrEqual,
    Not,
    NotEqualTo,
    NotIn,
    NotNaN,
    NotNull,
    NotStartsWith,
    Or,
    StartsWith,
)
from pyiceberg.expressions.visitors import _InclusiveMetricsEvaluator, _StrictMetricsEvaluator
from pyiceberg.manifest import DataFile, FileFormat
from pyiceberg.schema import Schema
from pyiceberg.types import (
    DoubleType,
    FloatType,
    IcebergType,
    IntegerType,
    NestedField,
    PrimitiveType,
    StringType,
)

# Lower and upper bounds recorded for the integer "id" column (field id 1) in the
# `data_file` fixture; the tests offset these values to probe around the bounds.
INT_MIN_VALUE = 30
INT_MAX_VALUE = 79


def _to_byte_buffer(field_type: IcebergType, val: Any) -> bytes:
    """Serialize ``val`` via ``to_bytes`` for the given primitive type.

    Raises:
        ValueError: If ``field_type`` is not a ``PrimitiveType``.
    """
    if isinstance(field_type, PrimitiveType):
        return to_bytes(field_type, val)
    raise ValueError(f"Expected a PrimitiveType, got: {type(field_type)}")


# Serialized forms of the integer bound values defined above.
INT_MIN = _to_byte_buffer(IntegerType(), INT_MIN_VALUE)
INT_MAX = _to_byte_buffer(IntegerType(), INT_MAX_VALUE)

# Serialized single-character string bounds.
STRING_MIN = _to_byte_buffer(StringType(), "a")
STRING_MAX = _to_byte_buffer(StringType(), "z")


@pytest.fixture
def schema_data_file() -> Schema:
    """Schema matching the column metrics recorded in the ``data_file`` fixture."""
    # (field id, name, type, required)
    column_specs = [
        (1, "id", IntegerType(), True),
        (2, "no_stats", IntegerType(), False),
        (3, "required", StringType(), True),
        (4, "all_nulls", StringType(), False),
        (5, "some_nulls", StringType(), False),
        (6, "no_nulls", StringType(), False),
        (7, "all_nans", DoubleType(), False),
        (8, "some_nans", FloatType(), False),
        (9, "no_nans", FloatType(), False),
        (10, "all_nulls_double", DoubleType(), False),
        (11, "all_nans_v1_stats", FloatType(), False),
        (12, "nan_and_null_only", DoubleType(), False),
        (13, "no_nan_stats", DoubleType(), False),
        (14, "some_empty", StringType(), False),
    ]
    fields = [
        NestedField(field_id, name, field_type, required=is_required)
        for field_id, name, field_type, is_required in column_specs
    ]
    return Schema(*fields)


@pytest.fixture
def data_file() -> DataFile:
    """Parquet data file with 50 records and per-column metrics keyed by field id.

    Field 1 carries integer bounds, fields 11 and 12 carry NaN bounds, and
    field 14 carries string bounds starting at the empty string.
    """
    float_nan = to_bytes(FloatType(), float("nan"))
    double_nan = to_bytes(DoubleType(), float("nan"))

    # Fields 4 through 14 each report 50 values.
    value_counts = dict.fromkeys(range(4, 15), 50)
    null_value_counts = {4: 50, 5: 10, 6: 0, 10: 50, 11: 0, 12: 1, 14: 8}
    nan_value_counts = {7: 50, 8: 10, 9: 0}

    lower_bounds = {
        1: to_bytes(IntegerType(), INT_MIN_VALUE),
        11: float_nan,
        12: double_nan,
        14: to_bytes(StringType(), ""),
    }
    upper_bounds = {
        1: to_bytes(IntegerType(), INT_MAX_VALUE),
        11: float_nan,
        12: double_nan,
        14: to_bytes(StringType(), "房东整租霍营小区二层两居室"),
    }

    return DataFile(
        file_path="file_1.parquet",
        file_format=FileFormat.PARQUET,
        partition={},
        record_count=50,
        file_size_in_bytes=3,
        value_counts=value_counts,
        null_value_counts=null_value_counts,
        nan_value_counts=nan_value_counts,
        lower_bounds=lower_bounds,
        upper_bounds=upper_bounds,
    )


@pytest.fixture
def data_file_2() -> DataFile:
    """Parquet data file whose only metrics are for field 3, with string bounds "aa".."dC"."""
    string_type = StringType()
    lower = to_bytes(string_type, "aa")
    upper = to_bytes(string_type, "dC")
    return DataFile(
        file_path="file_2.parquet",
        file_format=FileFormat.PARQUET,
        partition={},
        record_count=50,
        file_size_in_bytes=3,
        value_counts={3: 20},
        null_value_counts={3: 2},
        nan_value_counts=None,
        lower_bounds={3: lower},
        upper_bounds={3: upper},
    )


@pytest.fixture
def data_file_3() -> DataFile:
    """Parquet data file whose only metrics are for field 3, with string bounds "1str1".."3str3"."""
    string_type = StringType()
    lower = to_bytes(string_type, "1str1")
    upper = to_bytes(string_type, "3str3")
    return DataFile(
        file_path="file_3.parquet",
        file_format=FileFormat.PARQUET,
        partition={},
        record_count=50,
        file_size_in_bytes=3,
        value_counts={3: 20},
        null_value_counts={3: 2},
        nan_value_counts=None,
        lower_bounds={3: lower},
        upper_bounds={3: upper},
    )


@pytest.fixture
def data_file_4() -> DataFile:
    """Parquet data file whose only metrics are for field 3, with an ASCII lower and a non-ASCII upper bound."""
    string_type = StringType()
    lower = to_bytes(string_type, "abc")
    upper = to_bytes(string_type, "イロハニホヘト")
    return DataFile(
        file_path="file_4.parquet",
        file_format=FileFormat.PARQUET,
        partition={},
        record_count=50,
        file_size_in_bytes=3,
        value_counts={3: 20},
        null_value_counts={3: 2},
        nan_value_counts=None,
        lower_bounds={3: lower},
        upper_bounds={3: upper},
    )


def test_all_null(schema_data_file: Schema, data_file: DataFile) -> None:
    """Check inclusive evaluation of predicates against all-null, some-null and no-null string columns."""

    def might_match(predicate: Any) -> bool:
        return _InclusiveMetricsEvaluator(schema_data_file, predicate).eval(data_file)

    # Value predicates on a column that only holds nulls.
    assert not might_match(NotNull("all_nulls")), "Should skip: no non-null value in all null column"
    assert not might_match(LessThan("all_nulls", "a")), "Should skip: lessThan on all null column"
    assert not might_match(LessThanOrEqual("all_nulls", "a")), "Should skip: lessThanOrEqual on all null column"
    assert not might_match(GreaterThan("all_nulls", "a")), "Should skip: greaterThan on all null column"
    assert not might_match(GreaterThanOrEqual("all_nulls", "a")), "Should skip: greaterThanOrEqual on all null column"
    assert not might_match(EqualTo("all_nulls", "a")), "Should skip: equal on all null column"

    # Columns that hold at least one non-null value.
    assert might_match(NotNull("some_nulls")), "Should read: column with some nulls contains a non-null value"
    assert might_match(NotNull("no_nulls")), "Should read: non-null column contains a non-null value"

    # Prefix predicates on the all-null column.
    assert not might_match(StartsWith("all_nulls", "asad")), "Should skip: startsWith on all null column"
    assert might_match(NotStartsWith("all_nulls", "asad")), "Should read: notStartsWith on all null column"


def test_no_nulls(schema_data_file: Schema, data_file: DataFile) -> None:
    """Check ``IsNull`` against columns with all, some and no null values."""

    def might_match(column: str) -> bool:
        return _InclusiveMetricsEvaluator(schema_data_file, IsNull(column)).eval(data_file)

    assert might_match("all_nulls"), "Should read: at least one null value in all null column"
    assert might_match("some_nulls"), "Should read: column with some nulls contains a null value"
    assert not might_match("no_nulls"), "Should skip: non-null column contains no null values"


def test_is_nan(schema_data_file: Schema, data_file: DataFile) -> None:
    """Check ``IsNaN`` against columns with different NaN/null statistics."""

    def might_match(column: str) -> bool:
        return _InclusiveMetricsEvaluator(schema_data_file, IsNaN(column)).eval(data_file)

    assert might_match("all_nans"), "Should read: at least one nan value in all nan column"
    assert might_match("some_nans"), "Should read: at least one nan value in some nan column"
    assert not might_match("no_nans"), "Should skip: no-nans column contains no nan values"
    assert not might_match("all_nulls_double"), "Should skip: all-null column doesn't contain nan value"
    assert might_match("no_nan_stats"), "Should read: no guarantee on if contains nan value without nan stats"
    assert might_match("all_nans_v1_stats"), "Should read: at least one nan value in all nan column"
    assert might_match("nan_and_null_only"), "Should read: at least one nan value in nan and nulls only column"


def test_not_nan(schema_data_file: Schema, data_file: DataFile) -> None:
    """Check ``NotNaN`` against columns with different NaN/null statistics."""

    def might_match(column: str) -> bool:
        return _InclusiveMetricsEvaluator(schema_data_file, NotNaN(column)).eval(data_file)

    assert not might_match("all_nans"), "Should skip: column with all nans will not contain non-nan"
    assert might_match("some_nans"), "Should read: at least one non-nan value in some nan column"
    assert might_match("no_nans"), "Should read: at least one non-nan value in no nan column"
    assert might_match("all_nulls_double"), "Should read: at least one non-nan value in all null column"
    assert might_match("no_nan_stats"), "Should read: no guarantee on if contains nan value without nan stats"
    assert might_match("all_nans_v1_stats"), "Should read: no guarantee on if contains nan value without nan stats"
    assert might_match("nan_and_null_only"), "Should read: at least one null value in nan and nulls only column"


def test_required_column(schema_data_file: Schema, data_file: DataFile) -> None:
    """Check null predicates on the required column ``required``."""
    not_null_result = _InclusiveMetricsEvaluator(schema_data_file, NotNull("required")).eval(data_file)
    assert not_null_result, "Should read: required columns are always non-null"

    is_null_result = _InclusiveMetricsEvaluator(schema_data_file, IsNull("required")).eval(data_file)
    assert not is_null_result, "Should skip: required columns are always non-null"


def test_missing_column(schema_data_file: Schema, data_file: DataFile) -> None:
    """A predicate on a column absent from the schema must raise ``ValueError``."""
    with pytest.raises(ValueError) as exc_info:
        evaluator = _InclusiveMetricsEvaluator(schema_data_file, LessThan("missing", 22))
        evaluator.eval(data_file)

    expected_message = "Could not find field with name missing, case_sensitive=True"
    assert str(exc_info.value) == expected_message


def test_missing_stats() -> None:
    """Every predicate must read a file that carries no column statistics at all."""
    no_stats_schema = Schema(NestedField(2, "no_stats", DoubleType(), required=False))

    # All metric maps are explicitly absent.
    no_stats_file = DataFile(
        file_path="file_1.parquet",
        file_format=FileFormat.PARQUET,
        partition={},
        record_count=50,
        value_counts=None,
        null_value_counts=None,
        nan_value_counts=None,
        lower_bounds=None,
        upper_bounds=None,
    )

    column = "no_stats"
    expressions = (
        LessThan(column, 5),
        LessThanOrEqual(column, 30),
        EqualTo(column, 70),
        GreaterThan(column, 78),
        GreaterThanOrEqual(column, 90),
        NotEqualTo(column, 101),
        IsNull(column),
        NotNull(column),
        IsNaN(column),
        NotNaN(column),
    )

    for expression in expressions:
        evaluator = _InclusiveMetricsEvaluator(no_stats_schema, expression)
        assert evaluator.eval(no_stats_file), f"Should read when stats are missing for: {expression}"


def test_zero_record_file_stats(schema_data_file: Schema) -> None:
    """Every predicate must skip a file whose record count is zero."""
    zero_record_data_file = DataFile(
        file_path="file_1.parquet",
        file_format=FileFormat.PARQUET,
        partition={},
        record_count=0,
    )

    column = "no_stats"
    expressions = (
        LessThan(column, 5),
        LessThanOrEqual(column, 30),
        EqualTo(column, 70),
        GreaterThan(column, 78),
        GreaterThanOrEqual(column, 90),
        NotEqualTo(column, 101),
        IsNull(column),
        NotNull(column),
        IsNaN(column),
        NotNaN(column),
    )

    for expression in expressions:
        evaluator = _InclusiveMetricsEvaluator(schema_data_file, expression)
        assert not evaluator.eval(zero_record_data_file), f"Should skip a datafile without records: {expression}"


def test_not(schema_data_file: Schema, data_file: DataFile) -> None:
    """Check ``Not`` wrapped around comparisons placed below the ``id`` lower bound."""
    below_lower_bound = INT_MIN_VALUE - 25

    negated_less_than = Not(LessThan("id", below_lower_bound))
    assert _InclusiveMetricsEvaluator(schema_data_file, negated_less_than).eval(data_file), "Should read: not(false)"

    negated_greater_than = Not(GreaterThan("id", below_lower_bound))
    assert not _InclusiveMetricsEvaluator(schema_data_file, negated_greater_than).eval(data_file), "Should skip: not(true)"


def test_and(schema_data_file: Schema, data_file: DataFile) -> None:
    """Check that ``And`` only reads the file when both operands might match.

    The ``id`` column in ``data_file`` has bounds [INT_MIN_VALUE, INT_MAX_VALUE].
    """

    def might_match(left: Any, right: Any) -> bool:
        return _InclusiveMetricsEvaluator(schema_data_file, And(left, right)).eval(data_file)

    # Left operand is below the lower bound; right operand covers the whole range.
    assert not might_match(
        LessThan("id", INT_MIN_VALUE - 25), GreaterThanOrEqual("id", INT_MIN_VALUE - 30)
    ), "Should skip: and(false, true)"

    # Both operands lie outside the bounds. The right operand must sit above the
    # upper bound, otherwise this case degenerates into and(false, true) again.
    assert not might_match(
        LessThan("id", INT_MIN_VALUE - 25), GreaterThanOrEqual("id", INT_MAX_VALUE + 1)
    ), "Should skip: and(false, false)"

    # Both operands can be satisfied by the lower-bound value itself.
    assert might_match(
        GreaterThan("id", INT_MIN_VALUE - 25), LessThanOrEqual("id", INT_MIN_VALUE)
    ), "Should read: and(true, true)"


def test_or(schema_data_file: Schema, data_file: DataFile) -> None:
    """Check that ``Or`` reads the file when at least one operand might match."""

    def might_match(left: Any, right: Any) -> bool:
        return _InclusiveMetricsEvaluator(schema_data_file, Or(left, right)).eval(data_file)

    below_range = LessThan("id", INT_MIN_VALUE - 25)

    assert not might_match(below_range, GreaterThanOrEqual("id", INT_MAX_VALUE + 1)), "Should skip: or(false, false)"
    assert might_match(below_range, GreaterThanOrEqual("id", INT_MAX_VALUE - 19)), "Should read: or(false, true)"


def test_integer_lt(schema_data_file: Schema, data_file: DataFile) -> None:
    """Check ``LessThan`` on ``id`` around the lower bound recorded in ``data_file``."""

    def might_match(value: int) -> bool:
        return _InclusiveMetricsEvaluator(schema_data_file, LessThan("id", value)).eval(data_file)

    assert not might_match(INT_MIN_VALUE - 25), "Should not read: id range below lower bound (5 < 30)"
    assert not might_match(INT_MIN_VALUE), "Should not read: id range below lower bound (30 is not < 30)"
    assert might_match(INT_MIN_VALUE + 1), "Should read: one possible id"
    assert might_match(INT_MAX_VALUE), "Should read: many possible ids"


def test_integer_lt_eq(schema_data_file: Schema, data_file: DataFile) -> None:
    """Check ``LessThanOrEqual`` on ``id`` around the lower bound recorded in ``data_file``."""

    def might_match(value: int) -> bool:
        return _InclusiveMetricsEvaluator(schema_data_file, LessThanOrEqual("id", value)).eval(data_file)

    assert not might_match(INT_MIN_VALUE - 25), "Should not read: id range below lower bound (5 < 30)"
    # The literal here is 29, one below the lower bound.
    assert not might_match(INT_MIN_VALUE - 1), "Should not read: id range below lower bound (29 < 30)"
    assert might_match(INT_MIN_VALUE), "Should read: one possible id"
    assert might_match(INT_MAX_VALUE), "Should read: many possible ids"


def test_integer_gt(schema_data_file: Schema, data_file: DataFile) -> None:
    """Check ``GreaterThan`` on ``id`` around the upper bound recorded in ``data_file``."""

    def might_match(value: int) -> bool:
        return _InclusiveMetricsEvaluator(schema_data_file, GreaterThan("id", value)).eval(data_file)

    assert not might_match(INT_MAX_VALUE + 6), "Should not read: id range above upper bound (85 > 79)"
    assert not might_match(INT_MAX_VALUE), "Should not read: id range above upper bound (79 is not > 79)"
    # One below the upper bound: only the upper-bound value itself can satisfy the predicate.
    assert might_match(INT_MAX_VALUE - 1), "Should read: one possible id"
    assert might_match(INT_MAX_VALUE - 4), "Should read: many possible ids"


def test_integer_gt_eq(schema_data_file: Schema, data_file: DataFile) -> None:
    """Check ``GreaterThanOrEqual`` on ``id`` around the upper bound recorded in ``data_file``."""

    def might_match(value: int) -> bool:
        return _InclusiveMetricsEvaluator(schema_data_file, GreaterThanOrEqual("id", value)).eval(data_file)

    assert not might_match(INT_MAX_VALUE + 6), "Should not read: id range above upper bound (85 > 79)"
    assert not might_match(INT_MAX_VALUE + 1), "Should not read: id range above upper bound (80 > 79)"
    assert might_match(INT_MAX_VALUE), "Should read: one possible id"
    assert might_match(INT_MAX_VALUE - 4), "Should read: many possible ids"


def test_integer_eq(schema_data_file: Schema, data_file: DataFile) -> None:
    """Check ``EqualTo`` on ``id`` below, at, inside and above the recorded bounds."""

    def might_match(value: int) -> bool:
        return _InclusiveMetricsEvaluator(schema_data_file, EqualTo("id", value)).eval(data_file)

    assert not might_match(INT_MIN_VALUE - 25), "Should not read: id below lower bound"
    assert not might_match(INT_MIN_VALUE - 1), "Should not read: id below lower bound"
    assert might_match(INT_MIN_VALUE), "Should read: id equal to lower bound"
    assert might_match(INT_MAX_VALUE - 4), "Should read: id between lower and upper bounds"
    assert might_match(INT_MAX_VALUE), "Should read: id equal to upper bound"
    assert not might_match(INT_MAX_VALUE + 1), "Should not read: id above upper bound"
    assert not might_match(INT_MAX_VALUE + 6), "Should not read: id above upper bound"


def test_integer_not_eq(schema_data_file: Schema, data_file: DataFile) -> None:
    """``NotEqualTo`` on ``id`` must read the file wherever the literal sits relative to the bounds."""
    cases = (
        (INT_MIN_VALUE - 25, "Should read: id below lower bound"),
        (INT_MIN_VALUE - 1, "Should read: id below lower bound"),
        (INT_MIN_VALUE, "Should read: id equal to lower bound"),
        (INT_MAX_VALUE - 4, "Should read: id between lower and upper bounds"),
        (INT_MAX_VALUE, "Should read: id equal to upper bound"),
        (INT_MAX_VALUE + 1, "Should read: id above upper bound"),
        (INT_MAX_VALUE + 6, "Should read: id above upper bound"),
    )

    for value, reason in cases:
        evaluator = _InclusiveMetricsEvaluator(schema_data_file, NotEqualTo("id", value))
        assert evaluator.eval(data_file), reason


def test_integer_not_eq_rewritten(schema_data_file: Schema, data_file: DataFile) -> None:
    """``Not(EqualTo(...))`` on ``id`` must read the file wherever the literal sits relative to the bounds."""
    cases = (
        (INT_MIN_VALUE - 25, "Should read: id below lower bound"),
        (INT_MIN_VALUE - 1, "Should read: id below lower bound"),
        (INT_MIN_VALUE, "Should read: id equal to lower bound"),
        (INT_MAX_VALUE - 4, "Should read: id between lower and upper bounds"),
        (INT_MAX_VALUE, "Should read: id equal to upper bound"),
        (INT_MAX_VALUE + 1, "Should read: id above upper bound"),
        (INT_MAX_VALUE + 6, "Should read: id above upper bound"),
    )

    for value, reason in cases:
        evaluator = _InclusiveMetricsEvaluator(schema_data_file, Not(EqualTo("id", value)))
        assert evaluator.eval(data_file), reason


def test_integer_case_insensitive_not_eq_rewritten(schema_data_file: Schema, data_file: DataFile) -> None:
    """``Not(EqualTo("ID", ...))`` with ``case_sensitive=False`` must read the file for every literal."""
    cases = (
        (INT_MIN_VALUE - 25, "Should read: id below lower bound"),
        (INT_MIN_VALUE - 1, "Should read: id below lower bound"),
        (INT_MIN_VALUE, "Should read: id equal to lower bound"),
        (INT_MAX_VALUE - 4, "Should read: id between lower and upper bounds"),
        (INT_MAX_VALUE, "Should read: id equal to upper bound"),
        (INT_MAX_VALUE + 1, "Should read: id above upper bound"),
        (INT_MAX_VALUE + 6, "Should read: id above upper bound"),
    )

    for value, reason in cases:
        # The column is referenced in upper case on purpose.
        evaluator = _InclusiveMetricsEvaluator(schema_data_file, Not(EqualTo("ID", value)), case_sensitive=False)
        assert evaluator.eval(data_file), reason


def test_missing_column_case_sensitive(schema_data_file: Schema, data_file: DataFile) -> None:
    """With ``case_sensitive=True`` the upper-case name ``ID`` must not resolve and raises ``ValueError``."""
    with pytest.raises(ValueError) as exc_info:
        evaluator = _InclusiveMetricsEvaluator(schema_data_file, LessThan("ID", 22), case_sensitive=True)
        evaluator.eval(data_file)

    expected_message = "Could not find field with name ID, case_sensitive=True"
    assert str(exc_info.value) == expected_message


def test_integer_in(schema_data_file: Schema, data_file: DataFile) -> None:
    """Check ``In`` on ``id`` around the recorded bounds and on string columns with varying null counts."""

    def might_match(column: str, values: Any) -> bool:
        return _InclusiveMetricsEvaluator(schema_data_file, In(column, values)).eval(data_file)

    # Integer literals relative to the id bounds.
    assert not might_match(
        "id", {INT_MIN_VALUE - 25, INT_MIN_VALUE - 24}
    ), "Should not read: id below lower bound (5 < 30, 6 < 30)"
    assert not might_match(
        "id", {INT_MIN_VALUE - 2, INT_MIN_VALUE - 1}
    ), "Should not read: id below lower bound (28 < 30, 29 < 30)"
    assert might_match("id", {INT_MIN_VALUE - 1, INT_MIN_VALUE}), "Should read: id equal to lower bound (30 == 30)"
    assert might_match(
        "id", {INT_MAX_VALUE - 4, INT_MAX_VALUE - 3}
    ), "Should read: id between lower and upper bounds (30 < 75 < 79, 30 < 76 < 79)"
    assert might_match("id", {INT_MAX_VALUE, INT_MAX_VALUE + 1}), "Should read: id equal to upper bound (79 == 79)"
    assert not might_match(
        "id", {INT_MAX_VALUE + 1, INT_MAX_VALUE + 2}
    ), "Should not read: id above upper bound (80 > 79, 81 > 79)"
    assert not might_match(
        "id", {INT_MAX_VALUE + 6, INT_MAX_VALUE + 7}
    ), "Should not read: id above upper bound (85 > 79, 86 > 79)"

    # String columns with different null counts.
    assert not might_match("all_nulls", {"abc", "def"}), "Should skip: in on all nulls column"
    assert might_match("some_nulls", {"abc", "def"}), "Should read: in on some nulls column"
    assert might_match("no_nulls", {"abc", "def"}), "Should read: in on no nulls column"

    # A literal list of 400 ids.
    ids = list(range(400))
    assert might_match("id", ids), "Should read: large in expression"


def test_integer_not_in(schema_data_file: Schema, data_file: DataFile) -> None:
    """``NotIn`` must read the file for every literal set, on ``id`` and on the string columns."""

    def might_match(column: str, values: Any) -> bool:
        return _InclusiveMetricsEvaluator(schema_data_file, NotIn(column, values)).eval(data_file)

    # Integer literals relative to the id bounds.
    assert might_match(
        "id", {INT_MIN_VALUE - 25, INT_MIN_VALUE - 24}
    ), "Should read: id below lower bound (5 < 30, 6 < 30)"
    assert might_match(
        "id", {INT_MIN_VALUE - 2, INT_MIN_VALUE - 1}
    ), "Should read: id below lower bound (28 < 30, 29 < 30)"
    assert might_match("id", {INT_MIN_VALUE - 1, INT_MIN_VALUE}), "Should read: id equal to lower bound (30 == 30)"
    assert might_match(
        "id", {INT_MAX_VALUE - 4, INT_MAX_VALUE - 3}
    ), "Should read: id between lower and upper bounds (30 < 75 < 79, 30 < 76 < 79)"
    assert might_match("id", {INT_MAX_VALUE, INT_MAX_VALUE + 1}), "Should read: id equal to upper bound (79 == 79)"
    assert might_match(
        "id", {INT_MAX_VALUE + 1, INT_MAX_VALUE + 2}
    ), "Should read: id above upper bound (80 > 79, 81 > 79)"
    assert might_match(
        "id", {INT_MAX_VALUE + 6, INT_MAX_VALUE + 7}
    ), "Should read: id above upper bound (85 > 79, 86 > 79)"

    # String columns with different null counts; the messages name the notIn predicate under test.
    assert might_match("all_nulls", {"abc", "def"}), "Should read: notIn on all nulls column"
    assert might_match("some_nulls", {"abc", "def"}), "Should read: notIn on some nulls column"
    assert might_match("no_nulls", {"abc", "def"}), "Should read: notIn on no nulls column"


@pytest.fixture
def schema_data_file_nan() -> Schema:
    """Schema matching the NaN-oriented metrics recorded in the ``data_file_nan`` fixture."""
    # (field id, name, type, required)
    column_specs = [
        (1, "all_nan", DoubleType(), True),
        (2, "max_nan", DoubleType(), True),
        (3, "min_max_nan", FloatType(), False),
        (4, "all_nan_null_bounds", DoubleType(), True),
        (5, "some_nan_correct_bounds", FloatType(), False),
    ]
    fields = [
        NestedField(field_id, name, field_type, required=is_required)
        for field_id, name, field_type, is_required in column_specs
    ]
    return Schema(*fields)


@pytest.fixture
def data_file_nan() -> DataFile:
    """Data file whose five columns each hold 10 values, with NaN appearing in counts and/or bounds.

    Field 4 has no bounds at all; field 5 has numeric bounds 7..22 alongside 5 NaN values.
    """
    # NOTE(review): the path ends in ".avro" while the declared format is PARQUET — confirm this is intentional.
    double_nan = to_bytes(DoubleType(), float("nan"))
    float_nan = to_bytes(FloatType(), float("nan"))

    field_ids = range(1, 6)
    column_sizes = dict.fromkeys(field_ids, 10)
    value_counts = dict.fromkeys(field_ids, 10)
    null_value_counts = dict.fromkeys(field_ids, 0)

    lower_bounds = {
        1: double_nan,
        2: to_bytes(DoubleType(), 7),
        3: float_nan,
        5: to_bytes(FloatType(), 7),
    }
    upper_bounds = {
        1: double_nan,
        2: double_nan,
        3: float_nan,
        5: to_bytes(FloatType(), 22),
    }

    return DataFile(
        file_path="file.avro",
        file_format=FileFormat.PARQUET,
        partition={},
        record_count=50,
        file_size_in_bytes=3,
        column_sizes=column_sizes,
        value_counts=value_counts,
        null_value_counts=null_value_counts,
        nan_value_counts={1: 10, 4: 10, 5: 5},
        lower_bounds=lower_bounds,
        upper_bounds=upper_bounds,
    )


def test_inclusive_metrics_evaluator_less_than_and_less_than_equal(schema_data_file_nan: Schema, data_file_nan: DataFile) -> None:
    """Check ``LessThan`` and ``LessThanOrEqual`` against columns whose metrics involve NaN."""

    def might_match(comparison: Any, column: str, bound: int) -> bool:
        return _InclusiveMetricsEvaluator(schema_data_file_nan, comparison(column, bound)).eval(data_file_nan)

    for comparison in (LessThan, LessThanOrEqual):
        assert not might_match(comparison, "all_nan", 1), "Should not match: all nan column doesn't contain number"
        assert not might_match(comparison, "max_nan", 1), "Should not match: 1 is smaller than lower bound"
        assert might_match(comparison, "max_nan", 10), "Should match: 10 is larger than lower bound"
        assert might_match(comparison, "min_max_nan", 1), "Should match: no visibility"
        assert not might_match(
            comparison, "all_nan_null_bounds", 1
        ), "Should not match: all nan column doesn't contain number"
        assert not might_match(comparison, "some_nan_correct_bounds", 1), "Should not match: 1 is smaller than lower bound"
        assert might_match(comparison, "some_nan_correct_bounds", 10), "Should match: 10 larger than lower bound"


def test_inclusive_metrics_evaluator_greater_than_and_greater_than_equal(
    schema_data_file_nan: Schema, data_file_nan: DataFile
) -> None:
    """Run `GreaterThan` and `GreaterThanOrEqual` through the inclusive evaluator on the NaN fixtures.

    Both predicate classes are checked against the same table of
    (column, literal, expected result, failure message) rows.
    """
    cases = [
        ("all_nan", 1, False, "Should not match: all nan column doesn't contain number"),
        ("max_nan", 1, True, "Should match: upper bound is larger than 1"),
        ("max_nan", 10, True, "Should match: upper bound is larger than 10"),
        ("min_max_nan", 1, True, "Should match: no visibility"),
        ("all_nan_null_bounds", 1, False, "Should not match: all nan column doesn't contain number"),
        ("some_nan_correct_bounds", 1, True, "Should match: 1 is smaller than upper bound"),
        ("some_nan_correct_bounds", 10, True, "Should match: 10 is smaller than upper bound"),
        # This case must target a column with usable bounds: with "all_nan" it would only
        # repeat the first row instead of exercising the upper-bound comparison named in
        # the message.
        ("some_nan_correct_bounds", 30, False, "Should not match: 30 is greater than upper bound"),
    ]

    for comparison in [GreaterThan, GreaterThanOrEqual]:
        for column, value, expected, reason in cases:
            outcome = _InclusiveMetricsEvaluator(schema_data_file_nan, comparison(column, value)).eval(data_file_nan)  # type: ignore[arg-type]
            assert bool(outcome) is expected, reason


def test_inclusive_metrics_evaluator_equals(schema_data_file_nan: Schema, data_file_nan: DataFile) -> None:
    """Check `EqualTo` with the inclusive evaluator on the NaN fixtures."""

    def might_match(column: str, value: int) -> bool:
        """Evaluate `EqualTo(column, value)` against the NaN data file."""
        return _InclusiveMetricsEvaluator(schema_data_file_nan, EqualTo(column, value)).eval(data_file_nan)

    assert not might_match("all_nan", 1), "Should not match: all nan column doesn't contain number"
    assert not might_match("max_nan", 1), "Should not match: 1 is smaller than lower bound"
    assert might_match("max_nan", 10), "Should match: 10 is within bounds"
    assert might_match("min_max_nan", 1), "Should match: no visibility"
    assert not might_match("all_nan_null_bounds", 1), "Should not match: all nan column doesn't contain number"
    assert not might_match("some_nan_correct_bounds", 1), "Should not match: 1 is smaller than lower bound"
    assert might_match("some_nan_correct_bounds", 10), "Should match: 10 is within bounds"
    # Use a column with usable bounds here; "all_nan" would merely repeat the first
    # assertion and never reach the upper-bound comparison the message describes.
    assert not might_match("some_nan_correct_bounds", 30), "Should not match: 30 is greater than upper bound"


def test_inclusive_metrics_evaluator_not_equals(schema_data_file_nan: Schema, data_file_nan: DataFile) -> None:
    """Check that `NotEqualTo` is never used to skip the NaN data file.

    Every (column, literal) pair below is expected to evaluate truthy.
    """
    cases = [
        ("all_nan", 1),
        # Previously ("max_nan", 10) was asserted twice; literal 1 covers a second distinct value.
        ("max_nan", 1),
        ("max_nan", 10),
        ("min_max_nan", 1),
        ("all_nan_null_bounds", 1),
        ("some_nan_correct_bounds", 1),
        ("some_nan_correct_bounds", 10),
        ("some_nan_correct_bounds", 30),
    ]

    for column, value in cases:
        should_read = _InclusiveMetricsEvaluator(schema_data_file_nan, NotEqualTo(column, value)).eval(data_file_nan)
        assert should_read, "Should match: no visibility"


def test_inclusive_metrics_evaluator_in(schema_data_file_nan: Schema, data_file_nan: DataFile) -> None:
    """Check `In` with the inclusive evaluator on the NaN fixtures."""

    def might_match(column: str, candidates: Any) -> bool:
        """Evaluate `In(column, candidates)` against the NaN data file."""
        return _InclusiveMetricsEvaluator(schema_data_file_nan, In(column, candidates)).eval(data_file_nan)

    assert not might_match("all_nan", (1, 10, 30)), "Should not match: all nan column doesn't contain number"
    assert might_match("max_nan", (1, 10, 30)), "Should match: 10 and 30 are greater than lower bound"
    assert might_match("min_max_nan", (1, 10, 30)), "Should match: no visibility"
    assert not might_match("all_nan_null_bounds", (1, 10, 30)), "Should not match: all nan column doesn't contain number"

    # The remaining checks all use the column whose bounds are usable.
    assert might_match("some_nan_correct_bounds", (1, 10, 30)), "Should match: 10 within bounds"
    assert not might_match("some_nan_correct_bounds", (1, 30)), "Should not match: 1 and 30 not within bounds"
    assert might_match("some_nan_correct_bounds", (5, 7)), "Should match: overlap with lower bound"
    assert might_match("some_nan_correct_bounds", (22, 25)), "Should match: overlap with upper bounds"


def test_inclusive_metrics_evaluator_not_in(schema_data_file_nan: Schema, data_file_nan: DataFile) -> None:
    """Check that `NotIn` is never used to skip the NaN data file.

    Every (column, literals) pair below is expected to evaluate truthy. The
    original listed two pairs of byte-identical assertions; the duplicates are
    replaced so each row covers a distinct input.
    """
    cases = [
        ("all_nan", (1, 10, 30)),
        ("max_nan", (1, 10, 30)),
        ("min_max_nan", (1, 10, 30)),
        ("all_nan_null_bounds", (1, 10, 30)),
        ("some_nan_correct_bounds", (1, 30)),
        ("some_nan_correct_bounds", (1, 10, 30)),
    ]

    for column, values in cases:
        should_read = _InclusiveMetricsEvaluator(schema_data_file_nan, NotIn(column, values)).eval(data_file_nan)
        assert should_read, "Should match: no visibility"


def test_string_starts_with(
    schema_data_file: Schema, data_file: DataFile, data_file_2: DataFile, data_file_3: DataFile, data_file_4: DataFile
) -> None:
    """Check `StartsWith` with the inclusive evaluator across several data-file fixtures."""

    def might_match(column: str, prefix: str, file: DataFile) -> bool:
        """Evaluate `StartsWith(column, prefix)` against *file*."""
        return _InclusiveMetricsEvaluator(schema_data_file, StartsWith(column, prefix)).eval(file)

    assert might_match("required", "a", data_file), "Should read: no stats"

    # Prefixes expected to keep the file.
    assert might_match("required", "a", data_file_2), "Should read: range matches"
    assert might_match("required", "aa", data_file_2), "Should read: range matches"
    assert might_match("required", "aaa", data_file_2), "Should read: range matches"
    assert might_match("required", "1s", data_file_3), "Should read: range matches"
    assert might_match("required", "1str1x", data_file_3), "Should read: range matches"
    assert might_match("required", "ff", data_file_4), "Should read: range matches"

    # Prefixes expected to rule the file out.
    assert not might_match("required", "aB", data_file_2), "Should not read: range doesn't match"
    assert not might_match("required", "dWX", data_file_2), "Should not read: range doesn't match"
    assert not might_match("required", "5", data_file_3), "Should not read: range doesn't match"
    assert not might_match("required", "3str3x", data_file_3), "Should not read: range doesn't match"

    assert might_match("some_empty", "房东整租霍", data_file), "Should read: range matches"
    assert not might_match("all_nulls", "", data_file), "Should not read: range doesn't match"

    # NOTE(review): the case below is disabled; it appears to rely on a string-truncation
    # helper that is not available in this module -- confirm before enabling.
    # above_max = UnicodeUtil.truncateStringMax(Literal.of("イロハニホヘト"), 4).value().toString();

    # should_read = _InclusiveMetricsEvaluator(schema_data_file, StartsWith("required", above_max)).eval(data_file_4)
    # assert not should_read, "Should not read: range doesn't match"


def test_string_not_starts_with(
    schema_data_file: Schema, data_file: DataFile, data_file_2: DataFile, data_file_3: DataFile, data_file_4: DataFile
) -> None:
    """Check `NotStartsWith` with the inclusive evaluator across several data-file fixtures.

    Every active assertion expects a truthy result, so every failure message
    is phrased as "Should read".
    """

    def might_match(prefix: str, file: DataFile) -> bool:
        """Evaluate `NotStartsWith("required", prefix)` against *file*."""
        return _InclusiveMetricsEvaluator(schema_data_file, NotStartsWith("required", prefix)).eval(file)

    assert might_match("a", data_file), "Should read: no stats"

    assert might_match("a", data_file_2), "Should read: range matches"
    assert might_match("aa", data_file_2), "Should read: range matches"
    assert might_match("aaa", data_file_2), "Should read: range matches"
    assert might_match("1s", data_file_3), "Should read: range matches"
    assert might_match("1str1x", data_file_3), "Should read: range matches"
    assert might_match("ff", data_file_4), "Should read: range matches"

    # These four previously carried "Should not read" messages although they assert a
    # truthy result; the messages now agree with the assertion.
    assert might_match("aB", data_file_2), "Should read: range matches"
    assert might_match("dWX", data_file_2), "Should read: range matches"
    assert might_match("5", data_file_3), "Should read: range matches"
    assert might_match("3str3x", data_file_3), "Should read: range matches"

    # NOTE(review): the case below is disabled; it appears to rely on a string-truncation
    # helper that is not available in this module -- confirm before enabling.
    # above_max = UnicodeUtil.truncateStringMax(Literal.of("イロハニホヘト"), 4).value().toString();

    # should_read = _InclusiveMetricsEvaluator(schema_data_file, NotStartsWith("required", above_max)).eval(data_file_4)
    # assert should_read, "Should read: range matches"


@pytest.fixture
def strict_data_file_schema() -> Schema:
    """Return the 14-column schema used by the strict-evaluator tests.

    Only columns 1 (`id`) and 3 (`required`) are required; the rest are optional.
    """
    # (field id, name, type, required) in declaration order.
    columns = (
        (1, "id", IntegerType(), True),
        (2, "no_stats", IntegerType(), False),
        (3, "required", StringType(), True),
        (4, "all_nulls", StringType(), False),
        (5, "some_nulls", StringType(), False),
        (6, "no_nulls", StringType(), False),
        (7, "always_5", IntegerType(), False),
        (8, "all_nans", DoubleType(), False),
        (9, "some_nans", FloatType(), False),
        (10, "no_nans", FloatType(), False),
        (11, "all_nulls_double", DoubleType(), False),
        (12, "all_nans_v1_stats", FloatType(), False),
        (13, "nan_and_null_only", DoubleType(), False),
        (14, "no_nan_stats", DoubleType(), False),
    )
    fields = [NestedField(field_id, name, field_type, required=is_required) for field_id, name, field_type, is_required in columns]
    return Schema(*fields)


@pytest.fixture
def strict_data_file_1() -> DataFile:
    """Return a 50-record Parquet `DataFile` with value, null and NaN counts plus bounds.

    Keys in every metrics mapping are field ids.
    """
    nan = float("nan")

    # Each of these columns reports 50 values.
    counted_columns = (4, 5, 6, 8, 9, 10, 11, 12, 13, 14)
    value_counts = dict.fromkeys(counted_columns, 50)

    null_counts = {4: 50, 5: 10, 6: 0, 11: 50, 12: 0, 13: 1}
    nan_counts = {8: 50, 9: 10, 10: 0}

    minimums = {
        1: to_bytes(IntegerType(), INT_MIN_VALUE),
        7: to_bytes(IntegerType(), 5),
        12: to_bytes(FloatType(), nan),
        13: to_bytes(DoubleType(), nan),
    }
    maximums = {
        1: to_bytes(IntegerType(), INT_MAX_VALUE),
        7: to_bytes(IntegerType(), 5),
        12: to_bytes(FloatType(), nan),
        14: to_bytes(DoubleType(), nan),
    }

    return DataFile(
        file_path="file_1.parquet",
        file_format=FileFormat.PARQUET,
        partition={},
        record_count=50,
        file_size_in_bytes=3,
        value_counts=value_counts,
        null_value_counts=null_counts,
        nan_value_counts=nan_counts,
        lower_bounds=minimums,
        upper_bounds=maximums,
    )


@pytest.fixture
def strict_data_file_2() -> DataFile:
    """Return a 50-record Parquet `DataFile` with string bounds "bbb".."eee" on field 5 and no NaN counts."""
    value_counts = dict.fromkeys((4, 5, 6, 8), 50)
    null_counts = {4: 50, 5: 10, 6: 0}
    minimums = {5: to_bytes(StringType(), "bbb")}
    maximums = {5: to_bytes(StringType(), "eee")}

    return DataFile(
        file_path="file_2.parquet",
        file_format=FileFormat.PARQUET,
        partition={},
        record_count=50,
        file_size_in_bytes=3,
        value_counts=value_counts,
        null_value_counts=null_counts,
        nan_value_counts=None,
        lower_bounds=minimums,
        upper_bounds=maximums,
    )


@pytest.fixture
def strict_data_file_3() -> DataFile:
    """Return a 50-record Parquet `DataFile` with string bounds "bbb".."eee" on field 5 and no NaN counts.

    Unlike `file_2.parquet`, no value count is recorded for field 8.
    """
    value_counts = dict.fromkeys((4, 5, 6), 50)
    null_counts = {4: 50, 5: 10, 6: 0}
    minimums = {5: to_bytes(StringType(), "bbb")}
    maximums = {5: to_bytes(StringType(), "eee")}

    return DataFile(
        file_path="file_3.parquet",
        file_format=FileFormat.PARQUET,
        partition={},
        record_count=50,
        file_size_in_bytes=3,
        value_counts=value_counts,
        null_value_counts=null_counts,
        nan_value_counts=None,
        lower_bounds=minimums,
        upper_bounds=maximums,
    )


def test_strict_all_nulls(strict_data_file_schema: Schema, strict_data_file_1: DataFile) -> None:
    """Check `NotNull` (and one `NotEqualTo`) with the strict evaluator on the null-count columns."""

    def matches(predicate: Any) -> bool:
        """Evaluate *predicate* with the strict evaluator against `strict_data_file_1`."""
        return _StrictMetricsEvaluator(strict_data_file_schema, predicate).eval(strict_data_file_1)

    assert not matches(NotNull("all_nulls")), "Should not match: no non-null value in all null column"
    assert not matches(NotNull("some_nulls")), "Should not match: column with some nulls contains a non-null value"
    assert matches(NotNull("no_nulls")), "Should match: non-null column contains no null values"
    assert matches(NotEqualTo("all_nulls", "a")), "Should match: notEqual on all nulls column"


def test_strict_no_nulls(strict_data_file_schema: Schema, strict_data_file_1: DataFile) -> None:
    """Check `IsNull` with the strict evaluator on the null-count columns."""
    cases = [
        ("all_nulls", True, "Should match: all values are null"),
        ("some_nulls", False, "Should not match: not all values are null"),
        ("no_nulls", False, "Should not match: no values are null"),
    ]

    for column, expected, reason in cases:
        outcome = _StrictMetricsEvaluator(strict_data_file_schema, IsNull(column)).eval(strict_data_file_1)
        assert bool(outcome) is expected, reason


def test_strict_some_nulls(strict_data_file_schema: Schema, strict_data_file_2: DataFile, strict_data_file_3: DataFile) -> None:
    """Check that comparisons on the `some_nulls` column never match under the strict evaluator."""

    def matches(predicate: Any, file: DataFile) -> bool:
        """Evaluate *predicate* with the strict evaluator against *file*."""
        return _StrictMetricsEvaluator(strict_data_file_schema, predicate).eval(file)

    column = "some_nulls"

    assert not matches(LessThan(column, "ggg"), strict_data_file_2), "Should not match: lessThan on some nulls column"
    assert not matches(LessThanOrEqual(column, "ggg"), strict_data_file_2), "Should not match: lessThanOrEqual on some nulls column"
    assert not matches(GreaterThan(column, "aaa"), strict_data_file_2), "Should not match: greaterThan on some nulls column"
    assert not matches(
        GreaterThanOrEqual(column, "bbb"), strict_data_file_2
    ), "Should not match: greaterThanOrEqual on some nulls column"

    # The equality check is the only one run against the third file.
    assert not matches(EqualTo(column, "bbb"), strict_data_file_3), "Should not match: equal on some nulls column"


def test_strict_is_nan(strict_data_file_schema: Schema, strict_data_file_1: DataFile) -> None:
    """Check `IsNaN` with the strict evaluator on each floating-point column of the fixture."""
    cases = [
        ("all_nans", True, "Should match: all values are nan"),
        ("some_nans", False, "Should not match: at least one non-nan value in some nan column"),
        ("no_nans", False, "Should not match: at least one non-nan value in no nan column"),
        ("all_nulls_double", False, "Should not match: at least one non-nan value in all null column"),
        ("no_nan_stats", False, "Should not match: cannot determine without nan stats"),
        ("all_nans_v1_stats", False, "Should not match: cannot determine without nan stats"),
        ("nan_and_null_only", False, "Should not match: null values are not nan"),
    ]

    for column, expected, reason in cases:
        outcome = _StrictMetricsEvaluator(strict_data_file_schema, IsNaN(column)).eval(strict_data_file_1)
        assert bool(outcome) is expected, reason


def test_strict_not_nan(strict_data_file_schema: Schema, strict_data_file_1: DataFile) -> None:
    """Check `NotNaN` with the strict evaluator on each floating-point column of the fixture."""
    cases = [
        ("all_nans", False, "Should not match: all values are nan"),
        ("some_nans", False, "Should not match: at least one nan value in some nan column"),
        ("no_nans", True, "Should match: no value is nan"),
        ("all_nulls_double", True, "Should match: no nan value in all null column"),
        ("no_nan_stats", False, "Should not match: cannot determine without nan stats"),
        ("all_nans_v1_stats", False, "Should not match: all values are nan"),
        ("nan_and_null_only", False, "Should not match: null values are not nan"),
    ]

    for column, expected, reason in cases:
        outcome = _StrictMetricsEvaluator(strict_data_file_schema, NotNaN(column)).eval(strict_data_file_1)
        assert bool(outcome) is expected, reason


def test_strict_required_column(strict_data_file_schema: Schema, strict_data_file_1: DataFile) -> None:
    """Check `NotNull` / `IsNull` on the `required` column with the strict evaluator."""
    not_null_result = _StrictMetricsEvaluator(strict_data_file_schema, NotNull("required")).eval(strict_data_file_1)
    assert not_null_result, "Should match: required columns are always non-null"

    is_null_result = _StrictMetricsEvaluator(strict_data_file_schema, IsNull("required")).eval(strict_data_file_1)
    assert not is_null_result, "Should not match: required columns never contain null"


def test_strict_missing_column(strict_data_file_schema: Schema, strict_data_file_1: DataFile) -> None:
    """Check that a predicate on a column absent from the schema raises `ValueError` with the expected text."""
    expected_message = "Could not find field with name missing, case_sensitive=True"
    unknown_column_predicate = NotNull("missing")

    with pytest.raises(ValueError) as raised:
        _ = _StrictMetricsEvaluator(strict_data_file_schema, unknown_column_predicate).eval(strict_data_file_1)

    assert str(raised.value) == expected_message


def test_strict_missing_stats(strict_data_file_schema: Schema, strict_data_file_1: DataFile) -> None:
    """Check that no predicate matches under the strict evaluator when a file carries no metrics.

    The fixture arguments are not used by the body; the test builds its own
    single-column schema and a data file whose metrics are all `None`.
    """
    column = "no_stats"
    schema_without_stats = Schema(
        NestedField(2, column, DoubleType(), required=False),
    )

    file_without_stats = DataFile(
        file_path="file_1.parquet",
        file_format=FileFormat.PARQUET,
        partition={},
        record_count=50,
        value_counts=None,
        null_value_counts=None,
        nan_value_counts=None,
        lower_bounds=None,
        upper_bounds=None,
    )

    predicates = [
        LessThan(column, 5),
        LessThanOrEqual(column, 30),
        EqualTo(column, 70),
        GreaterThan(column, 78),
        GreaterThanOrEqual(column, 90),
        NotEqualTo(column, 101),
        IsNull(column),
        NotNull(column),
        IsNaN(column),
        NotNaN(column),
    ]

    for predicate in predicates:
        outcome = _StrictMetricsEvaluator(schema_without_stats, predicate).eval(file_without_stats)
        assert not outcome, f"Should never match when stats are missing for expr: {predicate}"


def test_strict_zero_record_file_stats(strict_data_file_schema: Schema) -> None:
    """Check that every predicate matches under the strict evaluator for a file with `record_count=0`."""
    empty_file = DataFile(file_path="file_1.parquet", file_format=FileFormat.PARQUET, partition={}, record_count=0)

    column = "no_stats"
    predicates = [
        LessThan(column, 5),
        LessThanOrEqual(column, 30),
        EqualTo(column, 70),
        GreaterThan(column, 78),
        GreaterThanOrEqual(column, 90),
        NotEqualTo(column, 101),
        IsNull(column),
        NotNull(column),
        IsNaN(column),
        NotNaN(column),
    ]

    for predicate in predicates:
        outcome = _StrictMetricsEvaluator(strict_data_file_schema, predicate).eval(empty_file)
        assert outcome, f"Should always match 0-record file: {predicate}"


def test_strict_not(schema_data_file: Schema, strict_data_file_1: DataFile) -> None:
    """Check that `Not` inverts the wrapped predicate's result under the strict evaluator.

    The failure messages previously stated the opposite of what each assertion
    checks; they now describe the asserted outcome.
    """
    should_read = _StrictMetricsEvaluator(schema_data_file, Not(LessThan("id", INT_MIN_VALUE - 25))).eval(strict_data_file_1)
    assert should_read, "Should match: not(false)"

    should_read = _StrictMetricsEvaluator(schema_data_file, Not(GreaterThan("id", INT_MIN_VALUE - 25))).eval(strict_data_file_1)
    assert not should_read, "Should not match: not(true)"


def test_strict_and(schema_data_file: Schema, strict_data_file_1: DataFile) -> None:
    """Check `And` of two range predicates on `id` with the strict evaluator."""

    def matches(left: Any, right: Any) -> bool:
        """Evaluate `And(left, right)` with the strict evaluator against `strict_data_file_1`."""
        return _StrictMetricsEvaluator(schema_data_file, And(left, right)).eval(strict_data_file_1)

    assert not matches(
        GreaterThan("id", INT_MIN_VALUE - 25), LessThanOrEqual("id", INT_MIN_VALUE)
    ), "Should not match: range may not overlap data"

    assert not matches(
        LessThan("id", INT_MIN_VALUE - 25), GreaterThanOrEqual("id", INT_MIN_VALUE - 30)
    ), "Should not match: range does not overlap data"

    assert matches(
        LessThan("id", INT_MAX_VALUE + 6), GreaterThanOrEqual("id", INT_MIN_VALUE - 30)
    ), "Should match: range includes all data"


def test_strict_or(schema_data_file: Schema, strict_data_file_1: DataFile) -> None:
    """Check `Or` of two range predicates on `id` with the strict evaluator."""

    def matches(left: Any, right: Any) -> bool:
        """Evaluate `Or(left, right)` with the strict evaluator against `strict_data_file_1`."""
        return _StrictMetricsEvaluator(schema_data_file, Or(left, right)).eval(strict_data_file_1)

    assert not matches(
        LessThan("id", INT_MIN_VALUE - 25), GreaterThanOrEqual("id", INT_MAX_VALUE + 1)
    ), "Should not match: no matching values"

    assert not matches(
        LessThan("id", INT_MIN_VALUE - 25), GreaterThanOrEqual("id", INT_MAX_VALUE - 19)
    ), "Should not match: some values do not match"

    assert matches(
        LessThan("id", INT_MIN_VALUE - 25), GreaterThanOrEqual("id", INT_MIN_VALUE)
    ), "Should match: all values match >= 30"


def test_strict_integer_lt(strict_data_file_schema: Schema, strict_data_file_1: DataFile) -> None:
    """Check `LessThan` on `id` with the strict evaluator at and around the file's bounds.

    The fixture's `id` bounds are `INT_MIN_VALUE` (30) and `INT_MAX_VALUE` (79).
    """

    def matches(value: int) -> bool:
        """Evaluate `LessThan("id", value)` with the strict evaluator against `strict_data_file_1`."""
        return _StrictMetricsEvaluator(strict_data_file_schema, LessThan("id", value)).eval(strict_data_file_1)

    assert not matches(INT_MIN_VALUE), "Should not match: always false"
    # `id < 31` excludes 31 itself, so the message names 31 (it previously said 32).
    assert not matches(INT_MIN_VALUE + 1), "Should not match: 31 and greater not in range"
    assert not matches(INT_MAX_VALUE), "Should not match: 79 not in range"
    assert matches(INT_MAX_VALUE + 1), "Should match: all values in range"


def test_strict_integer_lt_eq(strict_data_file_schema: Schema, strict_data_file_1: DataFile) -> None:
    """Check `LessThanOrEqual` on `id` with the strict evaluator at and around the file's bounds."""
    cases = [
        (INT_MIN_VALUE - 1, False, "Should not match: always false"),
        (INT_MIN_VALUE, False, "Should not match: 31 and greater not in range"),
        (INT_MAX_VALUE, True, "Should match: all values in range"),
        (INT_MAX_VALUE + 1, True, "Should match: all values in range"),
    ]

    for value, expected, reason in cases:
        outcome = _StrictMetricsEvaluator(strict_data_file_schema, LessThanOrEqual("id", value)).eval(strict_data_file_1)
        assert bool(outcome) is expected, reason


def test_strict_integer_gt(strict_data_file_schema: Schema, strict_data_file_1: DataFile) -> None:
    """Check `GreaterThan` on `id` with the strict evaluator at and around the file's bounds.

    The fixture's `id` bounds are `INT_MIN_VALUE` (30) and `INT_MAX_VALUE` (79).
    """

    def matches(value: int) -> bool:
        """Evaluate `GreaterThan("id", value)` with the strict evaluator against `strict_data_file_1`."""
        return _StrictMetricsEvaluator(strict_data_file_schema, GreaterThan("id", value)).eval(strict_data_file_1)

    assert not matches(INT_MAX_VALUE), "Should not match: always false"
    # `id > 78` excludes 78 itself, so the message names 78 (it previously said 77).
    assert not matches(INT_MAX_VALUE - 1), "Should not match: 78 and less not in range"
    assert not matches(INT_MIN_VALUE), "Should not match: 30 not in range"
    assert matches(INT_MIN_VALUE - 1), "Should match: all values in range"


def test_strict_integer_gt_eq(strict_data_file_schema: Schema, strict_data_file_1: DataFile) -> None:
    """Check `GreaterThanOrEqual` on `id` with the strict evaluator at and around the file's bounds."""
    cases = [
        (INT_MAX_VALUE + 1, False, "Should not match: no values in range"),
        (INT_MAX_VALUE, False, "Should not match: 78 and lower are not in range"),
        (INT_MIN_VALUE + 1, False, "Should not match: 30 not in range"),
        (INT_MIN_VALUE, True, "Should match: all values in range"),
    ]

    for value, expected, reason in cases:
        outcome = _StrictMetricsEvaluator(strict_data_file_schema, GreaterThanOrEqual("id", value)).eval(strict_data_file_1)
        assert bool(outcome) is expected, reason


def test_strict_integer_eq(strict_data_file_schema: Schema, strict_data_file_1: DataFile) -> None:
    """Check `EqualTo` with the strict evaluator on `id` and on the `always_5` column."""

    def matches(column: str, value: int) -> bool:
        """Evaluate `EqualTo(column, value)` with the strict evaluator against `strict_data_file_1`."""
        return _StrictMetricsEvaluator(strict_data_file_schema, EqualTo(column, value)).eval(strict_data_file_1)

    assert not matches("id", INT_MIN_VALUE - 25), "Should not match: all values != 5"
    assert not matches("id", INT_MIN_VALUE), "Should not match: some values != 30"
    assert not matches("id", INT_MAX_VALUE - 4), "Should not match: some values != 75"
    assert not matches("id", INT_MAX_VALUE), "Should not match: some values != 79"
    assert not matches("id", INT_MAX_VALUE + 1), "Should not match: some values != 80"

    # `always_5` has identical lower and upper bounds (5) in the fixture.
    assert matches("always_5", INT_MIN_VALUE - 25), "Should match: all values == 5"


def test_strict_integer_not_eq(strict_data_file_schema: Schema, strict_data_file_1: DataFile) -> None:
    """Check ``_StrictMetricsEvaluator`` results for ``NotEqualTo`` predicates on the ``id`` column.

    Literals below ``INT_MIN_VALUE`` (30) or above ``INT_MAX_VALUE`` (79) are
    expected to evaluate truthy; the literals 30, 75 and 79 are expected to
    evaluate falsy.
    """

    def strictly_not_equal(literal: int) -> bool:
        # Build the evaluator for `id != literal` and run it on the data file fixture.
        return _StrictMetricsEvaluator(strict_data_file_schema, NotEqualTo("id", literal)).eval(strict_data_file_1)

    assert strictly_not_equal(INT_MIN_VALUE - 25), "Should match: no values == 5"

    # INT_MIN_VALUE - 1 == 29, so the message must name 29.
    assert strictly_not_equal(INT_MIN_VALUE - 1), "Should match: no values == 29"

    assert not strictly_not_equal(INT_MIN_VALUE), "Should not match: some value may be == 30"

    assert not strictly_not_equal(INT_MAX_VALUE - 4), "Should not match: some value may be == 75"

    assert not strictly_not_equal(INT_MAX_VALUE), "Should not match: some value may be == 79"

    assert strictly_not_equal(INT_MAX_VALUE + 1), "Should match: no values == 80"

    assert strictly_not_equal(INT_MAX_VALUE + 6), "Should read: no values == 85"


def test_strict_integer_not_eq_rewritten(strict_data_file_schema: Schema, strict_data_file_1: DataFile) -> None:
    """Check ``_StrictMetricsEvaluator`` results for ``Not(EqualTo(...))`` predicates on the ``id`` column.

    Literals below ``INT_MIN_VALUE`` (30) or above ``INT_MAX_VALUE`` (79) are
    expected to evaluate truthy; the literals 30, 75 and 79 are expected to
    evaluate falsy.
    """
    # (literal, expected truthiness, assertion message), checked in this order
    cases = (
        (INT_MIN_VALUE - 25, True, "Should match: no values == 5"),
        # INT_MIN_VALUE - 1 == 29, so the message must name 29.
        (INT_MIN_VALUE - 1, True, "Should match: no values == 29"),
        (INT_MIN_VALUE, False, "Should not match: some value may be == 30"),
        (INT_MAX_VALUE - 4, False, "Should not match: some value may be == 75"),
        (INT_MAX_VALUE, False, "Should not match: some value may be == 79"),
        (INT_MAX_VALUE + 1, True, "Should match: no values == 80"),
        (INT_MAX_VALUE + 6, True, "Should read: no values == 85"),
    )

    for literal, expected, message in cases:
        negated = Not(EqualTo("id", literal))
        outcome = _StrictMetricsEvaluator(strict_data_file_schema, negated).eval(strict_data_file_1)
        assert bool(outcome) is expected, message


def test_strict_integer_in(strict_data_file_schema: Schema, strict_data_file_1: DataFile) -> None:
    """Check ``_StrictMetricsEvaluator`` results for ``In`` predicates.

    Integer sets are evaluated on ``id`` and ``always_5``, string sets on
    ``all_nulls``, ``some_nulls`` and ``no_nulls``. Only ``always_5 IN {5, 6}``
    is expected to evaluate truthy; every other case is expected to be falsy.
    """

    def strict_in(column: str, literals: Any) -> bool:
        # Build the evaluator for `column IN literals` and run it on the data file fixture.
        return _StrictMetricsEvaluator(strict_data_file_schema, In(column, literals)).eval(strict_data_file_1)

    assert not strict_in("id", {INT_MIN_VALUE - 25, INT_MIN_VALUE - 24}), "Should not match: all values != 5 and != 6"

    # The set is {29, 30}; the message names those literals.
    assert not strict_in("id", {INT_MIN_VALUE - 1, INT_MIN_VALUE}), "Should not match: some values != 29 and != 30"

    assert not strict_in("id", {INT_MAX_VALUE - 4, INT_MAX_VALUE - 3}), "Should not match: some values != 75 and != 76"

    # The set is {79, 80}; the message names those literals.
    assert not strict_in("id", {INT_MAX_VALUE, INT_MAX_VALUE + 1}), "Should not match: some values != 79 and != 80"

    assert not strict_in("id", {INT_MAX_VALUE + 1, INT_MAX_VALUE + 2}), "Should not match: some values != 80 and != 81"

    assert strict_in("always_5", {5, 6}), "Should match: all values == 5"

    assert not strict_in("all_nulls", {"abc", "def"}), "Should not match: in on all nulls column"

    assert not strict_in("some_nulls", {"abc", "def"}), "Should not match: in on some nulls column"

    assert not strict_in("no_nulls", {"abc", "def"}), "Should not match: no_nulls field does not have bounds"


def test_strict_integer_not_in(strict_data_file_schema: Schema, strict_data_file_1: DataFile) -> None:
    """Evaluate strict ``NotIn`` predicates against ``strict_data_file_1``.

    Integer sets are checked on ``id`` and ``always_5`` first, followed by
    string sets on the ``all_nulls``, ``some_nulls`` and ``no_nulls`` columns.
    """
    # NOTE(review): the case below is disabled; the reason is not visible here — confirm before enabling.
    # should_read = _StrictMetricsEvaluator(strict_data_file_schema, NotIn("id", {INT_MIN_VALUE - 25, INT_MIN_VALUE - 24})).eval(strict_data_file_1)
    # assert should_read, "Should match: all values != 5 and != 6"

    # (column, literal set, expected truthiness, assertion message), checked in this order
    integer_cases = (
        ("id", {INT_MIN_VALUE - 1, INT_MIN_VALUE}, False, "Should not match: some values may be == 30"),
        ("id", {INT_MAX_VALUE - 4, INT_MAX_VALUE - 3}, False, "Should not match: some value may be == 75 or == 76"),
        ("id", {INT_MAX_VALUE, INT_MAX_VALUE + 1}, False, "Should not match: some value may be == 79"),
        ("id", {INT_MAX_VALUE + 1, INT_MAX_VALUE + 2}, True, "Should match: no values == 80 or == 81"),
        ("always_5", {5, 6}, False, "Should not match: all values == 5"),
    )
    string_cases = (
        ("all_nulls", {"abc", "def"}, True, "Should match: notIn on all nulls column"),
        ("some_nulls", {"abc", "def"}, True, "Should match: notIn on some nulls column, 'bbb' > 'abc' and 'bbb' < 'def'"),
        ("no_nulls", {"abc", "def"}, False, "Should not match: no_nulls field does not have bounds"),
    )

    for column, int_literals, expected, message in integer_cases:
        verdict = _StrictMetricsEvaluator(strict_data_file_schema, NotIn(column, int_literals)).eval(strict_data_file_1)
        assert bool(verdict) is expected, message

    for column, str_literals, expected, message in string_cases:
        verdict = _StrictMetricsEvaluator(strict_data_file_schema, NotIn(column, str_literals)).eval(strict_data_file_1)
        assert bool(verdict) is expected, message
