blob: 0c0e0474af3fc2b0fe90ec73613f2b10a2da41db [file] [log] [blame]
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import datetime
import decimal
import os
import pytest
from nanoarrow.iterator import (
ArrayViewBaseIterator,
InvalidArrayWarning,
LossyConversionWarning,
UnregisteredExtensionWarning,
iter_array_views,
iter_py,
iter_tuples,
)
import nanoarrow as na
def test_iterator_warnings():
    """Check the message emitted by ``ArrayViewBaseIterator._warn()``.

    The ``match`` patterns pin the prefix used for an unnamed schema
    (``<unnamed int32>``) and for a named one (``some_colname <int32>``).
    """
    expected_unnamed = "<unnamed int32>: something"
    expected_named = "some_colname <int32>: something"

    with pytest.warns(LossyConversionWarning, match=expected_unnamed):
        unnamed_iterator = ArrayViewBaseIterator(na.int32())
        unnamed_iterator._warn("something", LossyConversionWarning)

    with pytest.warns(LossyConversionWarning, match=expected_named):
        named_schema = na.Schema(na.Type.INT32, name="some_colname")
        ArrayViewBaseIterator(named_schema)._warn("something", LossyConversionWarning)
def test_array_view_iterator():
    """iter_array_views() yields exactly one view for a single int32 array."""
    chunks = list(iter_array_views(na.c_array([1, 2, 3], na.int32())))
    assert len(chunks) == 1

    only_view = chunks[0]
    assert only_view.storage_type == "int32"
    # Buffer 1 is expected to hold the three values passed to c_array().
    assert list(only_view.buffer(1)) == [1, 2, 3]
def test_iterator_primitive():
    """iter_py() yields Python ints for an int32 array, whole and sliced."""
    ints = na.c_array([1, 2, 3], na.int32())
    assert list(iter_py(ints)) == [1, 2, 3]

    # Dropping the first element must be reflected in the iteration.
    assert list(iter_py(ints[1:])) == [2, 3]
def test_iterator_nullable_primitive():
    """iter_py() yields ``None`` for null int32 elements, whole and sliced."""
    ints_with_null = na.c_array([1, 2, 3, None], na.int32())
    assert list(iter_py(ints_with_null)) == [1, 2, 3, None]

    # The trailing null must survive slicing off the first element.
    assert list(iter_py(ints_with_null[1:])) == [2, 3, None]
@pytest.mark.parametrize(
    "arrow_type",
    [na.string(), na.large_string(), na.string_view()],
)
def test_iterator_string(arrow_type):
    """iter_py() yields ``str`` values for every string-like type."""
    strings = na.c_array(["ab", "cde"], arrow_type)
    assert list(iter_py(strings)) == ["ab", "cde"]

    # Same check after slicing off the first element.
    assert list(iter_py(strings[1:])) == ["cde"]
@pytest.mark.parametrize(
    "arrow_type",
    [na.string(), na.large_string(), na.string_view()],
)
def test_iterator_nullable_string(arrow_type):
    """iter_py() yields ``None`` for null elements of string-like types."""
    strings_with_null = na.c_array(["ab", "cde", None], arrow_type)
    assert list(iter_py(strings_with_null)) == ["ab", "cde", None]

    # Same check after slicing off the first element.
    assert list(iter_py(strings_with_null[1:])) == ["cde", None]
@pytest.mark.parametrize(
    "arrow_type",
    [na.binary(), na.large_binary(), na.binary_view()],
)
def test_iterator_binary(arrow_type):
    """iter_py() yields ``bytes`` values for every binary-like type."""
    blobs = na.c_array([b"ab", b"cde"], arrow_type)
    assert list(iter_py(blobs)) == [b"ab", b"cde"]

    # Same check after slicing off the first element.
    assert list(iter_py(blobs[1:])) == [b"cde"]
@pytest.mark.parametrize(
    "arrow_type",
    [na.binary(), na.large_binary(), na.binary_view()],
)
def test_iterator_nullable_binary(arrow_type):
    """iter_py() yields ``None`` for null elements of binary-like types."""
    blobs_with_null = na.c_array([b"ab", b"cde", None], arrow_type)
    assert list(iter_py(blobs_with_null)) == [b"ab", b"cde", None]

    # Same check after slicing off the first element.
    assert list(iter_py(blobs_with_null[1:])) == [b"cde", None]
def test_iter_tuples():
    """iter_tuples() yields one tuple per struct row.

    Covers four layouts: the plain array, a slice of it, a struct built from
    already-sliced children, and a slice of that struct.
    """
    struct_type = na.struct({"col1": na.int32(), "col2": na.bool_()})
    col1 = na.c_array([1, 2, 3], na.int32())
    col2 = na.c_array([1, 0, 1], na.bool_())
    array = na.c_array_from_buffers(
        struct_type, length=3, buffers=[None], children=[col1, col2]
    )
    expected = [(1, True), (2, False), (3, True)]

    assert list(iter_tuples(array)) == expected
    assert list(iter_tuples(array[1:])) == expected[1:]

    # Parent is unsliced here, but each child has had its first element removed.
    children_without_first = [array.child(0)[1:], array.child(1)[1:]]
    sliced_child = na.c_array_from_buffers(
        array.schema, length=2, buffers=[None], children=children_without_first
    )
    assert list(iter_tuples(sliced_child)) == expected[1:]

    # Slice the parent on top of the sliced children.
    assert list(iter_tuples(sliced_child[1:])) == expected[2:]
def test_iter_tuples_nullable():
    """iter_tuples() yields ``None`` for struct rows marked invalid.

    Covers the same four layouts as the non-nullable variant, with a validity
    buffer whose last entry is ``False``.
    """
    struct_type = na.struct({"col1": na.int32(), "col2": na.bool_()})
    validity = na.c_buffer([True, True, True, False], na.bool_())
    col1 = na.c_array([1, 2, 3, 4], na.int32())
    col2 = na.c_array([1, 0, 1, 0], na.bool_())
    array = na.c_array_from_buffers(
        struct_type, length=4, buffers=[validity], children=[col1, col2]
    )
    expected = [(1, True), (2, False), (3, True), None]

    assert list(iter_tuples(array)) == expected
    assert list(iter_tuples(array[1:])) == expected[1:]

    # Parent is unsliced here; the children and the validity buffer each start
    # one element later than in the original array.
    shorter_validity = na.c_buffer([True, True, False], na.bool_())
    children_without_first = [array.child(0)[1:], array.child(1)[1:]]
    sliced_child = na.c_array_from_buffers(
        array.schema,
        length=3,
        buffers=[shorter_validity],
        children=children_without_first,
    )
    assert list(iter_tuples(sliced_child)) == expected[1:]

    # Slice the parent on top of the sliced children.
    assert list(iter_tuples(sliced_child[1:])) == expected[2:]
def test_iter_tuples_errors():
    """iter_tuples() raises ``TypeError`` when given a non-struct array."""
    not_a_struct = na.c_array([1, 2, 3], na.int32())
    with pytest.raises(TypeError, match="can only iterate over struct arrays"):
        list(iter_tuples(not_a_struct))
def test_iterator_struct():
    """iter_py() yields one dict per struct row, keyed by field name."""
    struct_type = na.struct({"col1": na.int32(), "col2": na.bool_()})
    col1 = na.c_array([1, 2, 3], na.int32())
    col2 = na.c_array([1, 0, 1], na.bool_())
    array = na.c_array_from_buffers(
        struct_type, length=3, buffers=[None], children=[col1, col2]
    )
    expected = [
        {"col1": 1, "col2": True},
        {"col1": 2, "col2": False},
        {"col1": 3, "col2": True},
    ]

    assert list(iter_py(array)) == expected
    # Same check after slicing off the first row.
    assert list(iter_py(array[1:])) == expected[1:]
def test_iterator_nullable_struct():
    """iter_py() yields ``None`` for struct rows marked invalid."""
    struct_type = na.struct({"col1": na.int32(), "col2": na.bool_()})
    validity = na.c_buffer([True, True, True, False], na.bool_())
    col1 = na.c_array([1, 2, 3, 4], na.int32())
    col2 = na.c_array([1, 0, 1, 0], na.bool_())
    array = na.c_array_from_buffers(
        struct_type, length=4, buffers=[validity], children=[col1, col2]
    )
    expected = [
        {"col1": 1, "col2": True},
        {"col1": 2, "col2": False},
        {"col1": 3, "col2": True},
        None,
    ]

    assert list(iter_py(array)) == expected
    # Same check after slicing off the first row.
    assert list(iter_py(array[1:])) == expected[1:]
def test_iterator_list():
    """iter_py() yields Python lists for a pyarrow list array.

    Covers the plain array, a slice, a list array whose values child is
    itself sliced, and a slice of that array.
    """
    pa = pytest.importorskip("pyarrow")

    items = [[1, 2, 3], [4, 5, 6], [7, 8, None], [0]]
    array = pa.array(items)
    assert list(iter_py(array)) == items
    assert list(iter_py(array[1:])) == items[1:]

    # The values child starts one element late, so the first list is [2, 3].
    values_without_first = array.values[1:]
    array_sliced_child = pa.ListArray.from_arrays(
        [0, 2, 5, 8, 9], values_without_first
    )
    assert list(iter_py(array_sliced_child)) == [[2, 3]] + items[1:]

    # Slice the parent on top of the sliced child.
    assert list(iter_py(array_sliced_child[1:])) == items[1:]
def test_iterator_nullable_list():
    """iter_py() yields ``None`` for null entries of a pyarrow list array.

    Covers the plain array, a slice, a masked list array whose values child
    is itself sliced, and a slice of that array.
    """
    pa = pytest.importorskip("pyarrow")

    items = [[1, 2, 3], [4, 5, 6], [7, 8, None], [0], None]
    array = pa.array(items)
    assert list(iter_py(array)) == items
    assert list(iter_py(array[1:])) == items[1:]

    # The values child starts one element late; the mask nulls the last list.
    null_mask = pa.array([False, False, False, False, True])
    array_sliced_child = pa.ListArray.from_arrays(
        [0, 2, 5, 8, 9, 9],
        array.values[1:],
        mask=null_mask,
    )
    assert list(iter_py(array_sliced_child)) == [[2, 3]] + items[1:]

    # Slice the parent on top of the sliced child.
    assert list(iter_py(array_sliced_child[1:])) == items[1:]
def test_iterator_fixed_size_list():
    """iter_py() yields Python lists for a fixed-size (3) list array.

    Covers the plain array, a slice, an array built from a sliced values
    child, and a slice of that array.
    """
    pa = pytest.importorskip("pyarrow")

    items = [[1, 2, 3], [4, 5, 6], [7, 8, None]]
    array = pa.array(items, pa.list_(pa.int64(), 3))
    assert list(iter_py(array)) == items
    assert list(iter_py(array[1:])) == items[1:]

    # Skipping three values removes exactly the first fixed-size list.
    values_without_first_list = array.values[3:]
    array_sliced_child = pa.FixedSizeListArray.from_arrays(
        values_without_first_list, 3
    )
    assert list(iter_py(array_sliced_child)) == items[1:]

    # Slice the parent on top of the sliced child.
    assert list(iter_py(array_sliced_child[1:])) == items[2:]
def test_iterator_nullable_fixed_size_list():
    """iter_py() yields ``None`` for null entries of a fixed-size list array.

    Covers the plain array, a slice, a masked array built from a sliced
    values child, and a slice of that array.
    """
    pa = pytest.importorskip("pyarrow")

    items = [[1, 2, 3], [4, 5, 6], [7, 8, None], None]
    array = pa.array(items, pa.list_(pa.int64(), 3))
    assert list(iter_py(array)) == items
    assert list(iter_py(array[1:])) == items[1:]

    # mask argument only available for pyarrow >= 15.0.0
    null_mask = pa.array([False, False, True])
    array_sliced_child = pa.FixedSizeListArray.from_arrays(
        array.values[3:], 3, mask=null_mask
    )
    assert list(iter_py(array_sliced_child)) == items[1:]

    # Slice the parent on top of the sliced child.
    assert list(iter_py(array_sliced_child[1:])) == items[2:]
def test_iterator_dictionary():
    """iter_py() yields the decoded values of a dictionary-encoded array."""
    pa = pytest.importorskip("pyarrow")

    items = ["ab", "cde", "ab", "def", "cde"]
    encoded = pa.array(items).dictionary_encode()
    assert list(iter_py(encoded)) == items

    # Same check after slicing off the first element.
    assert list(iter_py(encoded[1:])) == items[1:]
def test_iterator_nullable_dictionary():
    """iter_py() yields ``None`` for null entries of a dictionary array."""
    pa = pytest.importorskip("pyarrow")

    items = ["ab", "cde", "ab", "def", "cde", None]
    encoded = pa.array(items).dictionary_encode()
    assert list(iter_py(encoded)) == items

    # Same check after slicing off the first element.
    assert list(iter_py(encoded[1:])) == items[1:]
def test_iterator_decimal():
    """iter_py() yields ``decimal.Decimal`` for decimal128 and decimal256."""
    pa = pytest.importorskip("pyarrow")

    items = [decimal.Decimal("12.3450"), None, decimal.Decimal("1234567.3456")]

    decimal128_array = pa.array(items, pa.decimal128(11, 4))
    assert list(iter_py(decimal128_array)) == items

    decimal256_array = pa.array(items, pa.decimal256(11, 4))
    assert list(iter_py(decimal256_array)) == items

    # Make sure this isn't affected by user-modified context
    one_digit_context = decimal.Context(prec=1)
    with decimal.localcontext(one_digit_context):
        assert list(iter_py(decimal256_array)) == items
def test_iterator_date():
    """iter_py() yields ``datetime.date`` for date32 and date64 arrays."""
    pa = pytest.importorskip("pyarrow")

    items = [datetime.date(1970, 1, 2), None, datetime.date(2024, 4, 8)]

    date32_array = pa.array(items, pa.date32())
    assert list(iter_py(date32_array)) == items

    date64_array = pa.array(items, pa.date64())
    assert list(iter_py(date64_array)) == items
def test_iterator_time():
    """iter_py() yields ``datetime.time`` for every time32/time64 unit.

    The first value is chosen per unit so that it is exactly representable
    at that unit's resolution.
    """
    pa = pytest.importorskip("pyarrow")

    cases = [
        (pa.time64("ns"), datetime.time(15, 45, 21, 12345)),
        (pa.time64("us"), datetime.time(15, 45, 21, 12345)),
        (pa.time32("ms"), datetime.time(15, 45, 21, 123000)),
        (pa.time32("s"), datetime.time(15, 45, 21)),
    ]
    for arrow_type, first_value in cases:
        items = [first_value, None, datetime.time(1, 23, 45)]
        array = pa.array(items, arrow_type)
        assert list(iter_py(array)) == items
def test_iterator_time_invalid():
    """Iterating a time32("s") value of 86400 emits ``InvalidArrayWarning``."""
    seconds_in_a_day = 60 * 60 * 24
    data = na.c_buffer([seconds_in_a_day], na.int32())
    time_invalid = na.c_array_from_buffers(na.time32("s"), 1, [None, data])

    with pytest.warns(InvalidArrayWarning):
        list(iter_py(time_invalid))
def test_iterator_timestamp():
    """iter_py() yields naive ``datetime.datetime`` for every timestamp unit.

    The microsecond field of the first value is adjusted per unit so that it
    is exactly representable at that unit's resolution.
    """
    pa = pytest.importorskip("pyarrow")

    first = datetime.datetime(1900, 1, 1, 11, 59, 1, 123)
    last = datetime.datetime(2050, 1, 1, 23, 59, 1, 0)

    for unit, microsecond in [("ns", 123), ("us", 123), ("ms", 123000), ("s", 0)]:
        items = [first.replace(microsecond=microsecond), None, last]
        array = pa.array(items, pa.timestamp(unit))
        assert list(iter_py(array)) == items
def test_iterator_timestamp_tz():
    """iter_py() yields tz-aware datetimes for zoned timestamp arrays.

    Uses the "America/Halifax" zone for every timestamp unit; the microsecond
    field of the first value is adjusted per unit.
    """
    from nanoarrow.iterator import _get_tzinfo

    pa = pytest.importorskip("pyarrow")

    zone_name = "America/Halifax"
    tz = _get_tzinfo(zone_name)
    first = datetime.datetime(1900, 1, 1, 11, 59, 1, 1234, tzinfo=tz)
    last = datetime.datetime(2050, 1, 1, 23, 59, 1, 0, tzinfo=tz)

    for unit, microsecond in [("ns", 1234), ("us", 1234), ("ms", 123000), ("s", 0)]:
        items = [first.replace(microsecond=microsecond), None, last]
        array = pa.array(items, pa.timestamp(unit, zone_name))
        assert list(iter_py(array)) == items
def test_iterator_lossy_timestamp():
    """Iterating a timestamp("ns") value of 1 emits ``LossyConversionWarning``."""
    # A raw value of 1 at nanosecond resolution.
    one_nanosecond = na.c_buffer([1], na.int64())
    datetime_with_ns = na.c_array_from_buffers(
        na.timestamp("ns"), 1, [None, one_nanosecond]
    )

    with pytest.warns(LossyConversionWarning):
        list(iter_py(datetime_with_ns))
def test_get_tzinfo():
    """Check the UTC offsets of tzinfo objects returned by ``_get_tzinfo()``.

    Covers "UTC" in both cases, fixed ``+HH:MM``/``-HH:MM`` offsets, a named
    zone resolved via each of the "zoneinfo" and "dateutil" strategies, and
    the error raised when no strategy is given.
    """
    from nanoarrow.iterator import _get_tzinfo

    naive = datetime.datetime(2020, 1, 2, 3, 4, 5)

    def utc_offset(tzinfo):
        # Offset from UTC of the reference datetime in the given zone.
        return naive.replace(tzinfo=tzinfo).utcoffset()

    assert utc_offset(_get_tzinfo("UTC")) == datetime.timedelta(0)
    assert utc_offset(_get_tzinfo("utc")) == datetime.timedelta(0)
    assert utc_offset(_get_tzinfo("+03:30")) == datetime.timedelta(
        hours=3, minutes=30
    )
    assert utc_offset(_get_tzinfo("-03:30")) == datetime.timedelta(
        hours=-3, minutes=-30
    )

    # The named-zone checks need both backends (plus tzdata on Windows).
    pytest.importorskip("zoneinfo")
    pytest.importorskip("dateutil")
    if os.name == "nt":
        pytest.importorskip("tzdata")

    tz_zoneinfo = _get_tzinfo("America/Halifax", strategy=["zoneinfo"])
    tz_dateutil = _get_tzinfo("America/Halifax", strategy=["dateutil"])
    for tz in (tz_zoneinfo, tz_dateutil):
        assert utc_offset(tz) == datetime.timedelta(hours=-4)

    # With no strategy to try, resolution is expected to fail.
    with pytest.raises(RuntimeError):
        _get_tzinfo("America/Halifax", strategy=[])
def test_iterator_duration():
    """iter_py() yields ``datetime.timedelta`` for every duration unit.

    The first value is chosen per unit so that it is exactly representable
    at that unit's resolution.
    """
    pa = pytest.importorskip("pyarrow")

    fine = datetime.timedelta(days=-12, seconds=-345, microseconds=-6789)
    cases = [
        ("ns", fine),
        ("us", fine),
        ("ms", datetime.timedelta(days=-12, seconds=-345, microseconds=-678000)),
        ("s", datetime.timedelta(days=-12, seconds=-345)),
    ]
    for unit, first_value in cases:
        items = [first_value, None, datetime.timedelta(days=12345, seconds=67890)]
        array = pa.array(items, pa.duration(unit))
        assert list(iter_py(array)) == items
def test_iterator_extension():
    """An unregistered extension array warns and yields its storage values."""
    extension_schema = na.extension_type(na.int32(), "arrow.test")
    storage = na.c_array([1, 2, 3], na.int32())

    # Export the storage array as a capsule and re-import it under the
    # extension schema.
    _, storage_capsule = na.c_array(storage).__arrow_c_array__()
    extension_array = na.c_array(storage_capsule, extension_schema)

    with pytest.warns(UnregisteredExtensionWarning):
        assert list(iter_py(extension_array)) == [1, 2, 3]
def test_iterator_null():
    """iter_py() yields ``None`` for each element of a null-type array."""
    three_nulls = na.c_array_from_buffers(na.null(), 3, [])
    assert list(iter_py(three_nulls)) == [None] * 3