| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| |
| import warnings |
| from functools import cached_property |
| from itertools import islice, repeat |
| from typing import Iterable, Tuple |
| |
| from nanoarrow._lib import CArrayView, CArrowType |
| from nanoarrow.c_array_stream import c_array_stream |
| from nanoarrow.c_schema import c_schema, c_schema_view |
| from nanoarrow.schema import Schema |
| |
| |
def iter_py(obj, schema=None) -> Iterable:
    """Iterate over items in zero or more arrays

    Returns an iterator over an array stream where each item is a
    Python representation of the next element.

    Parameters
    ----------
    obj : array stream-like
        An array-like or array stream-like object as sanitized by
        :func:`c_array_stream`.
    schema : schema-like, optional
        An optional schema, passed to :func:`c_array_stream`.

    Examples
    --------

    >>> import nanoarrow as na
    >>> from nanoarrow import iterator
    >>> array = na.c_array([1, 2, 3], na.int32())
    >>> list(iterator.iter_py(array))
    [1, 2, 3]
    """
    return PyIterator.get_iterator(obj, schema=schema)
| |
| |
def iter_tuples(obj, schema=None) -> Iterable[Tuple]:
    """Iterate over rows in zero or more struct arrays

    Returns an iterator over an array stream of struct arrays (i.e.,
    record batches) where each item is a tuple of the items in each
    row. This is different than :func:`iter_py`, which encodes struct
    columns as dictionaries.

    Parameters
    ----------
    obj : array stream-like
        An array-like or array stream-like object as sanitized by
        :func:`c_array_stream`.
    schema : schema-like, optional
        An optional schema, passed to :func:`c_array_stream`.

    Examples
    --------

    >>> import nanoarrow as na
    >>> from nanoarrow import iterator
    >>> import pyarrow as pa
    >>> array = pa.record_batch([pa.array([1, 2, 3])], names=["col1"])
    >>> list(iterator.iter_tuples(array))
    [(1,), (2,), (3,)]
    """
    return RowTupleIterator.get_iterator(obj, schema=schema)
| |
| |
def iter_array_views(obj, schema=None) -> Iterable[CArrayView]:
    """Iterate over prepared views of each array

    Returns an iterator which yields a :func:`c_array_view`
    for each chunk in ``obj``.

    Parameters
    ----------
    obj : array stream-like
        An array-like or array stream-like object as sanitized by
        :func:`c_array_stream`.
    schema : schema-like, optional
        An optional schema, passed to :func:`c_array_stream`.

    Examples
    --------

    >>> import nanoarrow as na
    >>> from nanoarrow import iterator
    >>> array = na.c_array([1, 2, 3], na.int32())
    >>> list(iterator.iter_array_views(array))
    [<nanoarrow.c_array.CArrayView>
    - storage_type: 'int32'
    - length: 3
    - offset: 0
    - null_count: 0
    - buffers[2]:
      - validity <bool[0 b] >
      - data <int32[12 b] 1 2 3>
    - dictionary: NULL
    - children[0]:]
    """
    with c_array_stream(obj, schema) as stream:
        for array in stream:
            yield array.view()
| |
| |
class InvalidArrayWarning(UserWarning):
    """Warns that an array's content appears invalid (e.g., a time value
    that encodes a non-zero number of days)."""
| |
| |
class LossyConversionWarning(UserWarning):
    """Warns that a conversion discarded information (e.g., nanoseconds
    truncated when converting to microsecond resolution)."""
| |
| |
class UnregisteredExtensionWarning(UserWarning):
    """Warns that an extension array was converted as its storage type
    because no converter is registered for the extension name."""
| |
| |
class ArrayViewBaseIterator:
    """Base class for iterators and visitors that use an internal ArrowArrayView
    as the basis for conversion to Python objects. Intended for internal use.
    """

    def __init__(self, schema, *, array_view=None):
        # Resolve both the C schema and its parsed view up front;
        # subclasses read cached type information from these attributes.
        self._schema = c_schema(schema)
        self._schema_view = c_schema_view(schema)

        # The array view may be allocated here or shared with a parent
        # iterator (e.g., a child or dictionary view owned by the parent).
        if array_view is None:
            self._array_view = CArrayView.from_schema(self._schema)
        else:
            self._array_view = array_view

    @cached_property
    def schema(self) -> Schema:
        """High-level :class:`Schema` wrapper of the underlying C schema."""
        return Schema(self._schema)

    @cached_property
    def _object_label(self):
        # Human-readable label ("name <type>") used to prefix warnings
        if self._schema.name:
            return f"{self._schema.name} <{self._schema_view.type}>"
        else:
            return f"<unnamed {self._schema_view.type}>"

    def _contains_nulls(self):
        # True only when a validity buffer is present and the null count
        # is non-zero; a zero-length buffer means no validity bitmap was
        # allocated, so every element is valid.
        return (
            self._schema_view.nullable
            and len(self._array_view.buffer(0))
            and self._array_view.null_count != 0
        )

    def _set_array(self, array):
        # Point the reusable array view at the next chunk. Returns self
        # so callers can chain (e.g., iterate immediately).
        self._array_view._set_array(array)
        return self

    def _warn(self, message, category):
        # Emit a warning prefixed with this field's name/type for context
        warnings.warn(f"{self._object_label}: {message}", category)
| |
| |
class PyIterator(ArrayViewBaseIterator):
    """Iterate over the Python object version of values in an ArrowArrayView.
    Intended for internal use.
    """

    @classmethod
    def get_iterator(cls, obj, schema=None):
        """Yield a Python object for every element in an array stream-like obj."""
        with c_array_stream(obj, schema=schema) as stream:
            iterator = cls(stream._get_cached_schema())
            for array in stream:
                # One iterator (and its array view) is reused across all
                # chunks; each chunk is attached before yielding from it.
                iterator._set_array(array)
                yield from iterator

    def __init__(self, schema, *, array_view=None):
        super().__init__(schema, array_view=array_view)

        # Recursively build child iterators that share the corresponding
        # child array views.
        self._children = list(
            map(self._make_child, self._schema.children, self._array_view.children)
        )

        if self._schema.dictionary is None:
            self._dictionary = None
        else:
            self._dictionary = self._make_child(
                self._schema.dictionary, self._array_view.dictionary
            )

    def _make_child(self, schema, array_view):
        # Overridden by subclasses (e.g., RowTupleIterator) to control how
        # child columns are converted.
        return type(self)(schema, array_view=array_view)

    @cached_property
    def _child_names(self):
        # Field names used as dict keys when converting struct elements
        return [child.name for child in self._schema.children]

    def __iter__(self):
        """Iterate over all elements in the current chunk"""
        return self._iter_chunk(0, len(self._array_view))

    def _iter_chunk(self, offset, length):
        """Iterate over all elements in a slice of the current chunk"""
        # Check for an extension type first since this isn't reflected by
        # self._schema_view.type_id. Currently we just return the storage
        # iterator with a warning for extension types.
        maybe_extension_name = self._schema_view.extension_name
        if maybe_extension_name:
            self._warn(
                f"Converting unregistered extension '{maybe_extension_name}' "
                "as storage type",
                UnregisteredExtensionWarning,
            )

        type_id = self._schema_view.type_id
        if type_id not in _ITEMS_ITER_LOOKUP:
            raise KeyError(
                f"Can't resolve iterator for type '{self._schema_view.type}'"
            )

        # Dispatch to the type-specific generator method by name
        factory = getattr(self, _ITEMS_ITER_LOOKUP[type_id])
        return factory(offset, length)

    def _dictionary_iter(self, offset, length):
        # Materialize the dictionary values once, then map each index
        # (or None for a null element) through the resulting list.
        dictionary = list(
            self._dictionary._iter_chunk(0, len(self._dictionary._array_view))
        )
        for dict_index in self._primitive_iter(offset, length):
            yield None if dict_index is None else dictionary[dict_index]

    def _wrap_iter_nullable(self, validity, items):
        # Replace items whose validity bit is unset with None
        for is_valid, item in zip(validity, items):
            yield item if is_valid else None

    def _struct_tuple_iter(self, offset, length):
        view = self._array_view
        # Logical offsets are relative to the view; add the view's own
        # offset to index the physical buffers.
        offset += view.offset
        # Zip the child iterators so each element is a tuple with one
        # value per child field.
        items = zip(*(child._iter_chunk(offset, length) for child in self._children))

        if self._contains_nulls():
            validity = view.buffer(0).elements(offset, length)
            return self._wrap_iter_nullable(validity, items)
        else:
            return items

    def _struct_iter(self, offset, length):
        # Like _struct_tuple_iter() but emits {field_name: value} dicts
        names = self._child_names
        for item in self._struct_tuple_iter(offset, length):
            yield None if item is None else {key: val for key, val in zip(names, item)}

    def _list_iter(self, offset, length):
        view = self._array_view
        offset += view.offset

        # buffer(1) holds length + 1 offsets delimiting each element's
        # range of values in the child array.
        offsets = memoryview(view.buffer(1))[offset : (offset + length + 1)]
        starts = offsets[:-1]
        ends = offsets[1:]
        child = self._children[0]
        # One child iterator spans the whole range; each list element is
        # a consecutive slice of it.
        child_iter = child._iter_chunk(starts[0], ends[-1] - starts[0])

        if self._contains_nulls():
            validity = view.buffer(0).elements(offset, length)
            for is_valid, start, end in zip(validity, starts, ends):
                # Child values are consumed even for null elements so
                # child_iter stays aligned with the offsets.
                item = list(islice(child_iter, end - start))
                yield item if is_valid else None
        else:
            for start, end in zip(starts, ends):
                yield list(islice(child_iter, end - start))

    def _fixed_size_list_iter(self, offset, length):
        view = self._array_view
        offset += view.offset
        child = self._children[0]
        # Every element contains exactly fixed_size child values
        fixed_size = view.layout.child_size_elements
        child_iter = child._iter_chunk(offset * fixed_size, length * fixed_size)

        if self._contains_nulls():
            validity = view.buffer(0).elements(offset, length)
            for is_valid in validity:
                # Consume child values even when null to keep child_iter
                # aligned.
                item = list(islice(child_iter, fixed_size))
                yield item if is_valid else None
        else:
            for _ in range(length):
                yield list(islice(child_iter, fixed_size))

    def _string_iter(self, offset, length):
        view = self._array_view
        offset += view.offset
        # Offsets buffer delimits each element's byte range in the data
        # buffer; bytes are decoded as UTF-8.
        offsets = memoryview(view.buffer(1))[offset : (offset + length + 1)]
        starts = offsets[:-1]
        ends = offsets[1:]
        data = memoryview(view.buffer(2))

        if self._contains_nulls():
            validity = view.buffer(0).elements(offset, length)
            for is_valid, start, end in zip(validity, starts, ends):
                yield str(data[start:end], "UTF-8") if is_valid else None
        else:
            for start, end in zip(starts, ends):
                yield str(data[start:end], "UTF-8")

    def _binary_iter(self, offset, length):
        # Same layout as _string_iter() but yields raw bytes
        view = self._array_view
        offset += view.offset
        offsets = memoryview(view.buffer(1))[offset : (offset + length + 1)]
        starts = offsets[:-1]
        ends = offsets[1:]
        data = memoryview(view.buffer(2))

        if self._contains_nulls():
            validity = view.buffer(0).elements(offset, length)
            for is_valid, start, end in zip(validity, starts, ends):
                yield bytes(data[start:end]) if is_valid else None
        else:
            for start, end in zip(starts, ends):
                yield bytes(data[start:end])

    def _decimal_iter(self, offset, length):
        from decimal import Context, Decimal
        from sys import byteorder

        # Storage yields the raw little/big-endian two's-complement bytes
        storage = self._primitive_iter(offset, length)
        precision = self._schema_view.decimal_precision

        # The approach here is to use Decimal(<integer>).scaleb(-scale),
        # which is a balance between simplicity, performance, and
        # safety (ensuring that we stay independent from the global precision).
        # We cache the scaleb and context to avoid doing so in the loop (the
        # argument to scaleb is transformed to a decimal by the .scaleb()
        # implementation).
        #
        # It would probably be fastest to go straight from binary
        # to string to decimal, since creating a decimal from a string
        # appears to be the fastest constructor.
        scaleb = Decimal(-self._schema_view.decimal_scale)
        context = Context(prec=precision)

        for item in storage:
            if item is None:
                yield None
            else:
                int_value = int.from_bytes(item, byteorder)
                yield Decimal(int_value).scaleb(scaleb, context)

    def _date_iter(self, offset, length):
        from datetime import date, timedelta

        storage = self._primitive_iter(offset, length)
        epoch = date(1970, 1, 1)

        # DATE32 stores days since the epoch; the other branch (DATE64)
        # stores milliseconds since the epoch.
        if self._schema_view.type_id == CArrowType.DATE32:
            for item in storage:
                if item is None:
                    yield item
                else:
                    yield epoch + timedelta(item)
        else:
            for item in storage:
                if item is None:
                    yield item
                else:
                    yield epoch + timedelta(milliseconds=item)

    def _time_iter(self, offset, length):
        from datetime import time

        for item in self._iter_time_components(offset, length):
            if item is None:
                yield None
            else:
                days, hours, mins, secs, us = item
                # A time-of-day value should never span more than one day
                if days != 0:
                    self._warn("days != 0", InvalidArrayWarning)

                yield time(hours, mins, secs, us)

    def _timestamp_iter(self, offset, length):
        from datetime import datetime

        epoch = datetime(1970, 1, 1, tzinfo=_get_tzinfo("UTC"))
        # Reuse the duration conversion: each value becomes a timedelta
        # from the epoch.
        parent = self._duration_iter(offset, length)

        tz = self._schema_view.timezone
        if tz:
            tz = _get_tzinfo(tz)

            # Timezone-aware: compute in UTC then convert to the target zone
            for item in parent:
                if item is None:
                    yield None
                else:
                    yield (epoch + item).astimezone(tz)
        else:
            # No timezone: yield naive datetimes
            epoch = epoch.replace(tzinfo=None)
            for item in parent:
                if item is None:
                    yield None
                else:
                    yield epoch + item

    def _duration_iter(self, offset, length):
        from datetime import timedelta

        storage = self._primitive_iter(offset, length)

        # Normalize all units to microseconds (timedelta's resolution);
        # nanoseconds are floor-divided with a lossy-conversion warning.
        unit = self._schema_view.time_unit
        if unit == "s":
            to_us = 1_000_000
        elif unit == "ms":
            to_us = 1000
        elif unit == "us":
            to_us = 1
        elif unit == "ns":
            storage = self._iter_us_from_ns(storage)
            to_us = 1

        for item in storage:
            if item is None:
                yield None
            else:
                yield timedelta(microseconds=item * to_us)

    def _iter_time_components(self, offset, length):
        # Yields (days, hours, mins, secs, us) tuples (or None for null)
        storage = self._primitive_iter(offset, length)

        unit = self._schema_view.time_unit
        if unit == "s":
            to_us = 1_000_000
        elif unit == "ms":
            to_us = 1000
        elif unit == "us":
            to_us = 1
        elif unit == "ns":
            storage = self._iter_us_from_ns(storage)
            to_us = 1

        us_per_sec = 1_000_000
        us_per_min = us_per_sec * 60
        us_per_hour = us_per_min * 60
        us_per_day = us_per_hour * 24

        for item in storage:
            if item is None:
                yield None
            else:
                # Successive divmod-style reduction from days down to us
                us = item * to_us
                days = us // us_per_day
                us = us % us_per_day
                hours = us // us_per_hour
                us = us % us_per_hour
                mins = us // us_per_min
                us = us % us_per_min
                secs = us // us_per_sec
                us = us % us_per_sec
                yield days, hours, mins, secs, us

    def _iter_us_from_ns(self, parent):
        # Floor-divide nanoseconds to microseconds, warning when the
        # conversion loses sub-microsecond information
        for item in parent:
            if item is None:
                yield None
            else:
                if item % 1000 != 0:
                    self._warn("nanoseconds discarded", LossyConversionWarning)
                yield item // 1000

    def _primitive_iter(self, offset, length):
        # Generic fixed-width iterator: buffer(1) yields the elements
        # directly; buffer(0) (if present) masks nulls.
        view = self._array_view
        offset += view.offset
        items = view.buffer(1).elements(offset, length)

        if self._contains_nulls():
            validity = view.buffer(0).elements(offset, length)
            return self._wrap_iter_nullable(validity, items)
        else:
            return iter(items)

    def _null_iter(self, offset, length):
        # The null type has no buffers; every element is None
        return repeat(None, length)
| |
| |
class RowTupleIterator(PyIterator):
    """Iterate over rows of a struct array (stream) where each row is a
    tuple instead of a dictionary. This is usually faster and matches other
    Python concepts more closely (e.g., dbapi's cursor, pandas itertuples).
    Intended for internal use.
    """

    def __init__(self, schema, *, array_view=None):
        super().__init__(schema, array_view=array_view)

        # Only struct arrays have "rows"; reject anything else up front
        type_name = self._schema_view.type
        if type_name != "struct":
            raise TypeError(
                "RowTupleIterator can only iterate over struct arrays "
                f"(got '{type_name}')"
            )

    def _make_child(self, schema, array_view):
        # Child columns convert with the default (dict-producing) iterator;
        # only the top level is tuple-ified.
        return PyIterator(schema, array_view=array_view)

    def _iter_chunk(self, offset, length):
        # Emit each row as a tuple of child values rather than a dict
        return self._struct_tuple_iter(offset, length)
| |
| |
| def _get_tzinfo(tz_string, strategy=None): |
| import re |
| from datetime import timedelta, timezone |
| |
| # We can handle UTC without any imports |
| if tz_string.upper() == "UTC": |
| return timezone.utc |
| |
| # Arrow also allows fixed-offset in the from +HH:MM |
| maybe_fixed_offset = re.search(r"^([+-])([0-9]{2}):([0-9]{2})$", tz_string) |
| if maybe_fixed_offset: |
| sign, hours, minutes = maybe_fixed_offset.groups() |
| sign = 1 if sign == "+" else -1 |
| return timezone(sign * timedelta(hours=int(hours), minutes=int(minutes))) |
| |
| # Try zoneinfo.ZoneInfo() (Python 3.9+) |
| if strategy is None or "zoneinfo" in strategy: |
| try: |
| from zoneinfo import ZoneInfo |
| |
| return ZoneInfo(tz_string) |
| except ImportError: |
| pass |
| |
| # Try dateutil.tz.gettz() |
| if strategy is None or "dateutil" in strategy: |
| try: |
| from dateutil.tz import gettz |
| |
| return gettz(tz_string) |
| except ImportError: |
| pass |
| |
| raise RuntimeError( |
| "zoneinfo (Python 3.9+, with tzdata on Windows) or " |
| "dateutil is required to resolve timezone" |
| ) |
| |
| |
# Maps a CArrowType id to the name of the PyIterator method that converts
# elements of that type. Type ids absent from this table raise KeyError
# in PyIterator._iter_chunk().
_ITEMS_ITER_LOOKUP = {
    CArrowType.NA: "_null_iter",
    CArrowType.BINARY: "_binary_iter",
    CArrowType.LARGE_BINARY: "_binary_iter",
    CArrowType.STRING: "_string_iter",
    CArrowType.LARGE_STRING: "_string_iter",
    CArrowType.STRUCT: "_struct_iter",
    CArrowType.LIST: "_list_iter",
    CArrowType.LARGE_LIST: "_list_iter",
    CArrowType.FIXED_SIZE_LIST: "_fixed_size_list_iter",
    CArrowType.DICTIONARY: "_dictionary_iter",
    CArrowType.DATE32: "_date_iter",
    CArrowType.DATE64: "_date_iter",
    CArrowType.TIME32: "_time_iter",
    CArrowType.TIME64: "_time_iter",
    CArrowType.TIMESTAMP: "_timestamp_iter",
    CArrowType.DURATION: "_duration_iter",
    CArrowType.DECIMAL128: "_decimal_iter",
    CArrowType.DECIMAL256: "_decimal_iter",
}

# Fixed-width types whose elements are produced directly by the data
# buffer's element getter (see PyIterator._primitive_iter())
_PRIMITIVE_TYPE_NAMES = [
    "BOOL",
    "UINT8",
    "INT8",
    "UINT16",
    "INT16",
    "UINT32",
    "INT32",
    "UINT64",
    "INT64",
    "HALF_FLOAT",
    "FLOAT",
    "DOUBLE",
    "FIXED_SIZE_BINARY",
    "INTERVAL_MONTHS",
    "INTERVAL_DAY_TIME",
    "INTERVAL_MONTH_DAY_NANO",
]

# Register every primitive type with the generic primitive iterator
for type_name in _PRIMITIVE_TYPE_NAMES:
    type_id = getattr(CArrowType, type_name)
    _ITEMS_ITER_LOOKUP[type_id] = "_primitive_iter"