# blob: 3591847ad6b69058fef70aa370bbd542b0631153 [file] [log] [blame]
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# pylint:disable=redefined-outer-name,eval-used
import pytest
from pyiceberg.manifest import DataFile, DataFileContent, ManifestContent, ManifestFile
from pyiceberg.table.snapshots import Operation, Snapshot, SnapshotSummaryCollector, Summary, update_snapshot_summaries
@pytest.fixture
def snapshot() -> Snapshot:
    """Provide a Snapshot whose summary is a bare APPEND operation."""
    append_only = Summary(Operation.APPEND)
    return Snapshot(
        snapshot_id=25,
        parent_snapshot_id=19,
        sequence_number=200,
        timestamp_ms=1602638573590,
        manifest_list="s3:/a/b/c.avro",
        summary=append_only,
        schema_id=3,
    )
@pytest.fixture
def snapshot_with_properties() -> Snapshot:
    """Provide a Snapshot whose APPEND summary also carries ``foo="bar"``."""
    append_with_foo = Summary(Operation.APPEND, foo="bar")
    return Snapshot(
        snapshot_id=25,
        parent_snapshot_id=19,
        sequence_number=200,
        timestamp_ms=1602638573590,
        manifest_list="s3:/a/b/c.avro",
        summary=append_with_foo,
        schema_id=3,
    )
def test_serialize_summary() -> None:
    """An operation-only Summary dumps to a JSON object with one key."""
    serialized = Summary(Operation.APPEND).model_dump_json()
    assert serialized == '{"operation":"append"}'
def test_serialize_summary_with_properties() -> None:
    """Extra keyword properties appear next to the operation in the JSON dump."""
    expected = '{"operation":"append","property":"yes"}'
    serialized = Summary(Operation.APPEND, property="yes").model_dump_json()
    assert serialized == expected
def test_serialize_snapshot(snapshot: Snapshot) -> None:
    """The fixture snapshot dumps to the expected compact, dash-keyed JSON."""
    # Adjacent literals concatenate into the single expected JSON string.
    expected = (
        '{"snapshot-id":25,"parent-snapshot-id":19,"sequence-number":200,'
        '"timestamp-ms":1602638573590,"manifest-list":"s3:/a/b/c.avro",'
        '"summary":{"operation":"append"},"schema-id":3}'
    )
    assert snapshot.model_dump_json() == expected
def test_serialize_snapshot_without_sequence_number() -> None:
    """A Snapshot built without ``sequence_number`` omits that key when dumped."""
    no_sequence_number = Snapshot(
        snapshot_id=25,
        parent_snapshot_id=19,
        timestamp_ms=1602638573590,
        manifest_list="s3:/a/b/c.avro",
        summary=Summary(Operation.APPEND),
        schema_id=3,
    )
    # Adjacent literals concatenate into the single expected JSON string.
    assert no_sequence_number.model_dump_json() == (
        '{"snapshot-id":25,"parent-snapshot-id":19,'
        '"timestamp-ms":1602638573590,"manifest-list":"s3:/a/b/c.avro",'
        '"summary":{"operation":"append"},"schema-id":3}'
    )
def test_serialize_snapshot_with_properties(snapshot_with_properties: Snapshot) -> None:
    """Summary properties are included inside the nested ``summary`` object."""
    # Adjacent literals concatenate into the single expected JSON string.
    expected = (
        '{"snapshot-id":25,"parent-snapshot-id":19,"sequence-number":200,'
        '"timestamp-ms":1602638573590,"manifest-list":"s3:/a/b/c.avro",'
        '"summary":{"operation":"append","foo":"bar"},"schema-id":3}'
    )
    assert snapshot_with_properties.model_dump_json() == expected
def test_deserialize_summary() -> None:
    """Parsing an operation-only JSON document yields an APPEND Summary."""
    parsed = Summary.model_validate_json('{"operation": "append"}')
    assert parsed.operation == Operation.APPEND
def test_deserialize_summary_with_properties() -> None:
    """Keys other than ``operation`` land in ``additional_properties``."""
    raw_json = '{"operation": "append", "property": "yes"}'
    parsed = Summary.model_validate_json(raw_json)
    assert parsed.operation == Operation.APPEND
    assert parsed.additional_properties == {"property": "yes"}
def test_deserialize_snapshot(snapshot: Snapshot) -> None:
    """Parsing the spaced JSON form reproduces the fixture snapshot."""
    # Adjacent literals concatenate into the single JSON payload.
    raw_json = (
        '{"snapshot-id": 25, "parent-snapshot-id": 19, "sequence-number": 200, '
        '"timestamp-ms": 1602638573590, "manifest-list": "s3:/a/b/c.avro", '
        '"summary": {"operation": "append"}, "schema-id": 3}'
    )
    parsed = Snapshot.model_validate_json(raw_json)
    assert parsed == snapshot
def test_deserialize_snapshot_with_properties(snapshot_with_properties: Snapshot) -> None:
    """Parsing JSON whose summary has an extra property matches the fixture."""
    # Adjacent literals concatenate into the single JSON payload.
    raw_json = (
        '{"snapshot-id":25,"parent-snapshot-id":19,"sequence-number":200,'
        '"timestamp-ms":1602638573590,"manifest-list":"s3:/a/b/c.avro",'
        '"summary":{"operation":"append","foo":"bar"},"schema-id":3}'
    )
    parsed = Snapshot.model_validate_json(raw_json)
    assert parsed == snapshot_with_properties
def test_snapshot_repr(snapshot: Snapshot) -> None:
    """``repr`` gives a constructor-style string that evaluates back to an equal object."""
    # Adjacent literals concatenate into the single expected repr string.
    expected_repr = (
        "Snapshot(snapshot_id=25, parent_snapshot_id=19, sequence_number=200, "
        "timestamp_ms=1602638573590, manifest_list='s3:/a/b/c.avro', "
        "summary=Summary(Operation.APPEND), schema_id=3)"
    )
    assert repr(snapshot) == expected_repr
    # eval only ever sees the repr of the fixture object created by this test module.
    round_tripped = eval(repr(snapshot))
    assert snapshot == round_tripped
def test_snapshot_with_properties_repr(snapshot_with_properties: Snapshot) -> None:
    """Summary properties show up as ``**{...}`` in the repr and survive an eval round trip."""
    # Adjacent literals concatenate into the single expected repr string.
    expected_repr = (
        "Snapshot(snapshot_id=25, parent_snapshot_id=19, sequence_number=200, "
        "timestamp_ms=1602638573590, manifest_list='s3:/a/b/c.avro', "
        "summary=Summary(Operation.APPEND, **{'foo': 'bar'}), schema_id=3)"
    )
    assert repr(snapshot_with_properties) == expected_repr
    # eval only ever sees the repr of the fixture object created by this test module.
    round_tripped = eval(repr(snapshot_with_properties))
    assert snapshot_with_properties == round_tripped
@pytest.fixture
def manifest_file() -> ManifestFile:
    """Provide a DATA ManifestFile with fixed length, file counts and row counts."""
    # Counts are grouped by kind; keyword order matches the original call.
    file_counts = {
        "added_files_count": 1,
        "existing_files_count": 2,
        "deleted_files_count": 3,
    }
    row_counts = {
        "added_rows_count": 100,
        "existing_rows_count": 110,
        "deleted_rows_count": 120,
    }
    return ManifestFile(
        content=ManifestContent.DATA,
        manifest_length=100,
        **file_counts,
        **row_counts,
    )
@pytest.fixture
def data_file() -> DataFile:
    """Provide a DATA DataFile with 100 records and a size of 1234 bytes."""
    fields = {
        "content": DataFileContent.DATA,
        "record_count": 100,
        "file_size_in_bytes": 1234,
    }
    return DataFile(**fields)
def test_snapshot_summary_collector(data_file: DataFile) -> None:
    """The collector builds an empty mapping at first, then reports the one added file."""
    collector = SnapshotSummaryCollector()
    assert collector.build() == {}

    collector.add_file(data_file)
    after_one_file = collector.build()
    assert after_one_file == {
        'added-data-files': '1',
        'added-files-size': '1234',
        'added-records': '100',
    }
def test_merge_snapshot_summaries_empty() -> None:
    """Updating an operation-only APPEND summary yields all six totals set to '0'."""
    zero_totals = {
        'total-data-files': '0',
        'total-delete-files': '0',
        'total-records': '0',
        'total-files-size': '0',
        'total-position-deletes': '0',
        'total-equality-deletes': '0',
    }
    merged = update_snapshot_summaries(Summary(Operation.APPEND))
    assert merged == Summary(operation=Operation.APPEND, **zero_totals)
def test_merge_snapshot_summaries_new_summary() -> None:
    """With no previous summary, each ``total-*`` equals the matching ``added-*`` value."""
    added = {
        'added-data-files': '1',
        'added-delete-files': '2',
        'added-equality-deletes': '3',
        'added-files-size': '4',
        'added-position-deletes': '5',
        'added-records': '6',
    }
    totals = {
        'total-data-files': '1',
        'total-delete-files': '2',
        'total-records': '6',
        'total-files-size': '4',
        'total-position-deletes': '5',
        'total-equality-deletes': '3',
    }

    # ``**added`` unpacks into fresh keyword arguments, so ``added`` is not shared with either Summary.
    actual = update_snapshot_summaries(summary=Summary(operation=Operation.APPEND, **added))
    expected = Summary(operation=Operation.APPEND, **added, **totals)

    assert actual == expected
def test_merge_snapshot_summaries_overwrite_summary() -> None:
    """Check the merged properties for an OVERWRITE with ``truncate_full_table=True``.

    The previous summary has every total set to '1'. The expected result keeps
    the ``added-*`` values, sets each ``total-*`` to the matching added value,
    and reports '1' for every deleted/removed counter.
    """
    added = {
        'added-data-files': '1',
        'added-delete-files': '2',
        'added-equality-deletes': '3',
        'added-files-size': '4',
        'added-position-deletes': '5',
        'added-records': '6',
    }
    previous_totals = dict.fromkeys(
        (
            'total-data-files',
            'total-delete-files',
            'total-equality-deletes',
            'total-files-size',
            'total-position-deletes',
            'total-records',
        ),
        '1',
    )

    actual = update_snapshot_summaries(
        summary=Summary(operation=Operation.OVERWRITE, **added),
        previous_summary=previous_totals,
        truncate_full_table=True,
    )

    new_totals = {
        'total-data-files': '1',
        'total-records': '6',
        'total-delete-files': '2',
        'total-equality-deletes': '3',
        'total-files-size': '4',
        'total-position-deletes': '5',
    }
    removed = dict.fromkeys(
        (
            'deleted-data-files',
            'removed-delete-files',
            'deleted-records',
            'removed-files-size',
            'removed-position-deletes',
            'removed-equality-deletes',
        ),
        '1',
    )
    expected = {**added, **new_totals, **removed}

    assert actual.additional_properties == expected
def test_invalid_operation() -> None:
    """REPLACE and DELETE summaries are rejected with a 'not implemented' ValueError."""
    cases = (
        (Operation.REPLACE, "Operation not implemented: Operation.REPLACE"),
        (Operation.DELETE, "Operation not implemented: Operation.DELETE"),
    )
    for operation, message in cases:
        with pytest.raises(ValueError) as exc_info:
            update_snapshot_summaries(summary=Summary(operation))
        assert message in str(exc_info.value)
def test_invalid_type() -> None:
    """A non-numeric total in the previous summary raises a ValueError that names the property."""
    added = {
        'added-data-files': '1',
        'added-delete-files': '2',
        'added-equality-deletes': '3',
        'added-files-size': '4',
        'added-position-deletes': '5',
        'added-records': '6',
    }
    overwrite = Summary(operation=Operation.OVERWRITE, **added)
    bad_previous = {'total-data-files': 'abc'}  # should be a number

    with pytest.raises(ValueError) as exc_info:
        update_snapshot_summaries(
            summary=overwrite,
            previous_summary=bad_previous,
            truncate_full_table=True,
        )

    assert "Could not parse summary property total-data-files to an int: abc" in str(exc_info.value)