blob: 4e040836979aa813234cae732cc8773cf342477d [file] [log] [blame]
# -*- coding: utf-8 -*-
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import pytest
from pyarrow.compat import unittest, u # noqa
from pyarrow.pandas_compat import _pandas_api # noqa
import pyarrow as pa
import collections
import datetime
import decimal
import itertools
import numpy as np
import six
import pytz
# Each entry pairs a NumPy integer scalar type with the Arrow integer type
# of the same signedness and width.
int_type_pairs = [
    # signed
    (np.int8, pa.int8()),
    (np.int16, pa.int16()),
    (np.int32, pa.int32()),
    (np.int64, pa.int64()),
    # unsigned
    (np.uint8, pa.uint8()),
    (np.uint16, pa.uint16()),
    (np.uint32, pa.uint32()),
    (np.uint64, pa.uint64()),
]

# Transpose the pairs; only the NumPy half is given a meaningful name.
np_int_types, _ = zip(*int_type_pairs)
class StrangeIterable:
    """Wrapper that offers nothing but ``__iter__`` over the wrapped object."""

    def __init__(self, lst):
        # The wrapped object is stored by reference, not copied.
        self.lst = lst

    def __iter__(self):
        # Hand iteration over to the wrapped object's own iterator.
        wrapped = self.lst
        return wrapped.__iter__()
def check_struct_type(ty, expected):
    """
    Assert that ``ty`` is a struct type whose members match ``expected``.

    The members of both types are compared as sets, so their order is
    ignored.
    """
    assert pa.types.is_struct(ty)
    actual_members = set(ty)
    expected_members = set(expected)
    assert actual_members == expected_members
def test_iterable_types():
    """A custom iterable converts to the same array as an equal tuple."""
    from_iterable = pa.array(StrangeIterable([0, 1, 2, 3]))
    from_tuple = pa.array((0, 1, 2, 3))
    assert from_iterable.equals(from_tuple)
def test_empty_iterable():
    """An empty custom iterable yields an empty array of null type."""
    result = pa.array(StrangeIterable([]))
    assert len(result) == 0
    assert result.null_count == 0
    assert result.type == pa.null()
    assert result.to_pylist() == []
def test_limited_iterator_types():
    """An iterator with an exact ``size`` hint converts all of its items."""
    from_iterator = pa.array(iter(range(3)), type=pa.int64(), size=3)
    from_tuple = pa.array((0, 1, 2))
    assert from_iterator.equals(from_tuple)
def test_limited_iterator_size_overflow():
    """With ``size`` below the item count only the first items are kept."""
    from_iterator = pa.array(iter(range(3)), type=pa.int64(), size=2)
    from_tuple = pa.array((0, 1))
    assert from_iterator.equals(from_tuple)
def test_limited_iterator_size_underflow():
    """With ``size`` above the item count every available item is kept."""
    from_iterator = pa.array(iter(range(3)), type=pa.int64(), size=10)
    from_tuple = pa.array((0, 1, 2))
    assert from_iterator.equals(from_tuple)
def test_iterator_without_size():
    """A finite iterator converts without a ``size`` hint."""
    reference = pa.array((0, 1, 2))

    inferred = pa.array(iter(range(3)))
    assert inferred.equals(reference)

    # Same with explicit type
    typed = pa.array(iter(range(3)), type=pa.int64())
    assert typed.equals(reference)
def test_infinite_iterator():
    """``size`` bounds how much of an endless iterator is consumed."""
    reference = pa.array((0, 1, 2))

    inferred = pa.array(itertools.count(0), size=3)
    assert inferred.equals(reference)

    # Same with explicit type
    typed = pa.array(itertools.count(0), type=pa.int64(), size=3)
    assert typed.equals(reference)
def _as_list(xs):
return xs
def _as_tuple(xs):
return tuple(xs)
def _as_deque(xs):
# deque is a sequence while neither tuple nor list
return collections.deque(xs)
def _as_dict_values(xs):
    """Return the values view of a dict mapping positions to items of ``xs``."""
    # a dict values view is a plain iterable, not a sequence
    indexed = dict(enumerate(xs))
    return six.viewvalues(indexed)
# Decorator that runs a test once per container kind; the test receives the
# conversion helper as its ``seq`` argument.
parametrize_with_iterable_types = pytest.mark.parametrize(
    "seq",
    [
        _as_list,
        _as_tuple,
        _as_deque,
        _as_dict_values,
    ])
@parametrize_with_iterable_types
def test_sequence_types(seq):
    """Every container kind converts like the equivalent plain list."""
    from_container = pa.array(seq([1, 2, 3]))
    from_list = pa.array([1, 2, 3])
    assert from_container.equals(from_list)
@parametrize_with_iterable_types
def test_sequence_boolean(seq):
    """Python bools mixed with None are inferred as a boolean array."""
    values = [True, None, False, None]
    result = pa.array(seq(values))
    assert len(result) == 4
    assert result.null_count == 2
    assert result.type == pa.bool_()
    assert result.to_pylist() == values
@parametrize_with_iterable_types
def test_sequence_numpy_boolean(seq):
    """NumPy boolean scalars mixed with None are inferred as a boolean array."""
    # Use ``np.bool_`` (the NumPy scalar type).  ``np.bool`` used to be a
    # plain alias of the builtin ``bool`` (deprecated in NumPy 1.20, removed
    # in 1.24), so with it no NumPy scalar was actually being converted.
    expected = [np.bool_(True), None, np.bool_(False), None]
    arr = pa.array(seq(expected))
    assert len(arr) == 4
    assert arr.null_count == 2
    assert arr.type == pa.bool_()
    assert arr.to_pylist() == expected
@parametrize_with_iterable_types
def test_empty_list(seq):
    """An empty container yields an empty array of null type."""
    result = pa.array(seq([]))
    assert len(result) == 0
    assert result.null_count == 0
    assert result.type == pa.null()
    assert result.to_pylist() == []
@parametrize_with_iterable_types
def test_nested_lists(seq):
    """Lists of integers convert to list arrays, inferred or explicitly typed."""
    nested = [[], [1, 2], None]

    # Inferred value type
    inferred = pa.array(seq(nested))
    assert len(inferred) == 3
    assert inferred.null_count == 1
    assert inferred.type == pa.list_(pa.int64())
    assert inferred.to_pylist() == nested

    # With explicit type
    typed = pa.array(seq(nested), type=pa.list_(pa.int32()))
    assert len(typed) == 3
    assert typed.null_count == 1
    assert typed.type == pa.list_(pa.int32())
    assert typed.to_pylist() == nested
@parametrize_with_iterable_types
def test_list_with_non_list(seq):
    """A scalar item is rejected when an explicit list type is requested."""
    # List types don't accept non-sequences
    items = [[], [1, 2], 3]
    with pytest.raises(pa.ArrowTypeError):
        pa.array(seq(items), type=pa.list_(pa.int64()))
@parametrize_with_iterable_types
def test_nested_arrays(seq):
    """NumPy int64 arrays used as items convert to a list-of-int64 array."""
    items = [
        np.array([], dtype=np.int64),
        np.array([1, 2], dtype=np.int64),
        None,
    ]
    result = pa.array(seq(items))
    assert len(result) == 3
    assert result.null_count == 1
    assert result.type == pa.list_(pa.int64())
    assert result.to_pylist() == [[], [1, 2], None]
@parametrize_with_iterable_types
def test_sequence_all_none(seq):
    """A container holding only None is inferred as a null-typed array."""
    result = pa.array(seq([None, None]))
    assert len(result) == 2
    assert result.null_count == 2
    assert result.type == pa.null()
    assert result.to_pylist() == [None, None]
@parametrize_with_iterable_types
@pytest.mark.parametrize("np_scalar_pa_type", int_type_pairs)
def test_sequence_integer(seq, np_scalar_pa_type):
    """Python ints, including the type's extremes, fit each integer type."""
    np_scalar, pa_type = np_scalar_pa_type
    limits = np.iinfo(np_scalar)
    values = [1, None, 3, None, limits.min, limits.max]
    result = pa.array(seq(values), type=pa_type)
    assert len(result) == 6
    assert result.null_count == 2
    assert result.type == pa_type
    assert result.to_pylist() == values
@parametrize_with_iterable_types
@pytest.mark.parametrize("np_scalar_pa_type", int_type_pairs)
def test_sequence_integer_np_nan(seq, np_scalar_pa_type):
    """NaN is rejected for integer types unless ``from_pandas`` is set."""
    # ARROW-2806: numpy.nan is a double value, so it cannot be stored in an
    # integer array; with from_pandas=True it is converted to a null.
    _, pa_type = np_scalar_pa_type

    with pytest.raises(ValueError):
        pa.array(seq([np.nan]), type=pa_type, from_pandas=False)

    result = pa.array(seq([np.nan]), type=pa_type, from_pandas=True)
    assert len(result) == 1
    assert result.null_count == 1
    assert result.type == pa_type
    assert result.to_pylist() == [None]
@parametrize_with_iterable_types
@pytest.mark.parametrize("np_scalar_pa_type", int_type_pairs)
def test_sequence_integer_nested_np_nan(seq, np_scalar_pa_type):
    """Nested NaN is rejected for integer lists unless ``from_pandas`` is set."""
    # ARROW-2806: numpy.nan is a double value, so it cannot be stored in a
    # list of integers; with from_pandas=True it becomes a null child value.
    _, pa_type = np_scalar_pa_type
    list_type = pa.list_(pa_type)

    with pytest.raises(ValueError):
        pa.array(seq([[np.nan]]), type=pa.list_(pa_type), from_pandas=False)

    result = pa.array(seq([[np.nan]]), type=pa.list_(pa_type),
                      from_pandas=True)
    assert len(result) == 1
    # The outer list itself is valid; only its single child is null.
    assert result.null_count == 0
    assert result.type == list_type
    assert result.to_pylist() == [[None]]
@parametrize_with_iterable_types
def test_sequence_integer_inferred(seq):
    """Python ints mixed with None are inferred as int64."""
    values = [1, None, 3, None]
    result = pa.array(seq(values))
    assert len(result) == 4
    assert result.null_count == 2
    assert result.type == pa.int64()
    assert result.to_pylist() == values
@parametrize_with_iterable_types
@pytest.mark.parametrize("np_scalar_pa_type", int_type_pairs)
def test_sequence_numpy_integer(seq, np_scalar_pa_type):
    """NumPy integer scalars convert to the explicitly requested type."""
    np_scalar, pa_type = np_scalar_pa_type
    limits = np.iinfo(np_scalar)
    values = [
        np_scalar(1), None, np_scalar(3), None,
        np_scalar(limits.min),
        np_scalar(limits.max),
    ]
    result = pa.array(seq(values), type=pa_type)
    assert len(result) == 6
    assert result.null_count == 2
    assert result.type == pa_type
    assert result.to_pylist() == values
@parametrize_with_iterable_types
@pytest.mark.parametrize("np_scalar_pa_type", int_type_pairs)
def test_sequence_numpy_integer_inferred(seq, np_scalar_pa_type):
    """NumPy integer scalars are inferred as the matching Arrow type."""
    np_scalar, pa_type = np_scalar_pa_type
    limits = np.iinfo(np_scalar)
    values = [
        np_scalar(1), None, np_scalar(3), None,
        np_scalar(limits.min),
        np_scalar(limits.max),
    ]
    result = pa.array(seq(values))
    assert len(result) == 6
    assert result.null_count == 2
    assert result.type == pa_type
    assert result.to_pylist() == values
def test_numpy_scalars_mixed_type():
    """Mixed NumPy int32/float32 scalars are inferred as float64."""
    # ARROW-4324
    mixed = [np.int32(10), np.float32(0.5)]
    result = pa.array(mixed)
    reference = pa.array([10, 0.5], type='float64')
    assert result.equals(reference)
@pytest.mark.xfail(reason="Type inference for uint64 not implemented",
                   raises=pa.ArrowException)
def test_uint64_max_convert():
    """The largest uint64 value converts with an explicit or inferred type."""
    values = [0, np.iinfo(np.uint64).max]

    typed = pa.array(values, type=pa.uint64())
    reference = pa.array(np.array(values, dtype='uint64'))
    assert typed.equals(reference)

    inferred = pa.array(values)
    assert inferred.equals(reference)
@pytest.mark.parametrize("bits", [8, 16, 32, 64])
def test_signed_integer_overflow(bits):
    """Values just outside a signed type's range are rejected."""
    ty = getattr(pa, "int%d" % bits)()
    one_above_max = 2 ** (bits - 1)
    one_below_min = -2 ** (bits - 1) - 1
    # XXX ideally would raise OverflowError
    for out_of_range in (one_above_max, one_below_min):
        with pytest.raises((ValueError, pa.ArrowException)):
            pa.array([out_of_range], ty)
@pytest.mark.parametrize("bits", [8, 16, 32, 64])
def test_unsigned_integer_overflow(bits):
    """Values just outside an unsigned type's range are rejected."""
    ty = getattr(pa, "uint%d" % bits)()
    one_above_max = 2 ** bits
    # XXX ideally would raise OverflowError
    for out_of_range in (one_above_max, -1):
        with pytest.raises((ValueError, pa.ArrowException)):
            pa.array([out_of_range], ty)
def test_convert_with_mask():
    """Entries flagged True in ``mask`` become nulls; lengths must match."""
    values = [1, 2, 3, 4, 5]
    null_mask = np.array([False, True, False, False, True])

    masked = pa.array(values, mask=null_mask)
    reference = pa.array([1, None, 3, 4, None])
    assert masked.equals(reference)

    # Mask wrong length
    short_mask = null_mask[1:]
    with pytest.raises(ValueError):
        pa.array(values, mask=short_mask)
def test_garbage_collection():
    """A discarded array leaves the allocated byte count unchanged."""
    import gc

    # Force the cyclic garbage collector to run
    gc.collect()
    baseline = pa.total_allocated_bytes()

    # The result is deliberately not kept, so it can be collected below.
    pa.array([1, None, 3, None])
    gc.collect()

    assert pa.total_allocated_bytes() == baseline
def test_sequence_double():
    """Python floats mixed with None are inferred as float64."""
    values = [1.5, 1., None, 2.5, None, None]
    result = pa.array(values)
    assert len(result) == 6
    assert result.null_count == 3
    assert result.type == pa.float64()
    assert result.to_pylist() == values
def test_double_auto_coerce_from_integer():
    """An int among floats is coerced to double, whatever its position."""
    # Done as part of ARROW-2814
    all_floats = pa.array([1.5, 1., None, 2.5, None, None])
    int_in_middle = pa.array([1.5, 1, None, 2.5, None, None])
    assert all_floats.equals(int_in_middle)

    int_first = pa.array([1, 1.5, None, 2.5, None, None])
    float_first = pa.array([1., 1.5, None, 2.5, None, None])
    assert int_first.equals(float_first)
def test_double_integer_coerce_representable_range():
    """Ints coerced to double must lie within +/- 2**53."""
    bound = 1 << 53
    in_range = [1.5, 1, 2, None, bound, -bound]
    out_of_range_cases = [
        [1.5, 1, 2, None, bound + 1],
        [1.5, 1, 2, None, -(bound + 1)],
    ]

    # it works
    pa.array(in_range)

    # it fails
    for values in out_of_range_cases:
        with pytest.raises(ValueError):
            pa.array(values)
def test_float32_integer_coerce_representable_range():
    """Ints coerced to float32 must lie within +/- 2**24."""
    f32 = np.float32
    bound = 1 << 24
    in_range = [f32(1.5), bound, -bound]
    out_of_range_cases = [
        [f32(1.5), bound + 1],
        [f32(1.5), -(bound + 1)],
    ]

    # it works
    pa.array(in_range, type=pa.float32())

    # it fails
    for values in out_of_range_cases:
        with pytest.raises(ValueError):
            pa.array(values, type=pa.float32())
def test_mixed_sequence_errors():
    """A string among typed values fails with a message naming the target."""
    bool_and_str = [True, 'foo']
    with pytest.raises(ValueError, match="tried to convert to boolean"):
        pa.array(bool_and_str, type=pa.bool_())

    float_and_str = [1.5, 'foo']
    with pytest.raises(ValueError, match="tried to convert to float32"):
        pa.array(float_and_str, type=pa.float32())

    # No explicit type here: the error message names double.
    with pytest.raises(ValueError, match="tried to convert to double"):
        pa.array(float_and_str)
@parametrize_with_iterable_types
@pytest.mark.parametrize("np_scalar,pa_type", [
    (np.float16, pa.float16()),
    (np.float32, pa.float32()),
    (np.float64, pa.float64())
])
@pytest.mark.parametrize("from_pandas", [True, False])
def test_sequence_numpy_double(seq, np_scalar, pa_type, from_pandas):
    """NumPy float scalars plus a NaN convert with or without ``from_pandas``."""
    data = [np_scalar(1.5), np_scalar(1), None, np_scalar(2.5), None, np.nan]
    arr = pa.array(seq(data), from_pandas=from_pandas)
    assert len(arr) == 6

    # With from_pandas the trailing NaN counts as a null as well.
    expected_nulls = 3 if from_pandas else 2
    assert arr.null_count == expected_nulls

    # The NaN is skipped in type inference when from_pandas is set,
    # otherwise it forces a float64 promotion
    expected_type = pa_type if from_pandas else pa.float64()
    assert arr.type == expected_type

    assert arr.to_pylist()[:4] == data[:4]

    last = arr.to_pylist()[5]
    if from_pandas:
        assert last is None
    else:
        assert np.isnan(last)
@pytest.mark.parametrize("from_pandas", [True, False])
@pytest.mark.parametrize("inner_seq", [np.array, list])
def test_ndarray_nested_numpy_double(from_pandas, inner_seq):
    """An ndarray of float sequences converts to a list-of-float64 array."""
    # ARROW-2806
    rows = np.array([
        inner_seq([1., 2.]),
        inner_seq([1., 2., 3.]),
        inner_seq([np.nan]),
        None
    ])
    result = pa.array(rows, from_pandas=from_pandas)
    assert len(result) == 4
    assert result.null_count == 1
    assert result.type == pa.list_(pa.float64())

    if from_pandas:
        # The nested NaN is turned into a null
        assert result.to_pylist() == [[1.0, 2.0], [1.0, 2.0, 3.0], [None],
                                      None]
    else:
        # NaN survives, so compare with a NaN-aware helper
        np.testing.assert_equal(result.to_pylist(),
                                [[1., 2.], [1., 2., 3.], [np.nan], None])
def test_nested_ndarray_in_object_array():
    """Object ndarrays holding arrays or lists convert to nested lists."""
    # ARROW-4350
    of_arrays = np.empty(2, dtype=object)
    of_arrays[:] = [np.array([1, 2], dtype=np.int64),
                    np.array([2, 3], dtype=np.int64)]

    of_lists = np.empty(2, dtype=object)
    of_lists[0] = [3, 4]
    of_lists[1] = [5, 6]

    nested_type = pa.list_(pa.list_(pa.int64()))
    assert pa.infer_type([of_arrays]) == nested_type

    result = pa.array([of_arrays, of_lists])
    reference = pa.array([[[1, 2], [2, 3]], [[3, 4], [5, 6]]],
                         type=nested_type)
    assert result.equals(reference)

    # test case for len-1 arrays to ensure they are interpreted as
    # sublists and not scalars
    singletons = np.empty(2, dtype=object)
    singletons[:] = [np.array([1]), np.array([2])]
    result = pa.array([singletons, singletons])
    assert result.to_pylist() == [[[1], [2]], [[1], [2]]]
@pytest.mark.xfail(reason=("Type inference for multidimensional ndarray "
                           "not yet implemented"),
                   raises=AssertionError)
def test_multidimensional_ndarray_as_nested_list():
    """2-D int64 ndarrays should convert like lists of lists (xfail)."""
    # TODO(wesm): see ARROW-5645
    first = np.array([[1, 2], [2, 3]], dtype=np.int64)
    second = np.array([[3, 4], [5, 6]], dtype=np.int64)

    nested_type = pa.list_(pa.list_(pa.int64()))
    assert pa.infer_type([first]) == nested_type

    result = pa.array([first, second])
    reference = pa.array([[[1, 2], [2, 3]], [[3, 4], [5, 6]]],
                         type=nested_type)
    assert result.equals(reference)
def test_array_ignore_nan_from_pandas():
    """NaN next to a string only converts when ``from_pandas`` is set."""
    # See ARROW-4324, this reverts logic that was introduced in
    # ARROW-2240
    nan_and_str = [np.nan, 'str']
    with pytest.raises(ValueError):
        pa.array(nan_and_str)

    result = pa.array(nan_and_str, from_pandas=True)
    reference = pa.array([None, 'str'])
    assert result.equals(reference)
def test_nested_ndarray_different_dtypes():
    """Item arrays of differing integer dtypes share one list value type."""
    items = [
        np.array([1, 2, 3], dtype='int64'),
        None,
        np.array([4, 5, 6], dtype='uint32')
    ]

    # Inferred: the list value type comes out as int64
    inferred = pa.array(items)
    reference = pa.array([[1, 2, 3], None, [4, 5, 6]],
                         type=pa.list_(pa.int64()))
    assert inferred.equals(reference)

    # Explicit list-of-uint32 type
    uint32_lists = pa.list_(pa.uint32())
    typed = pa.array(items, type=uint32_lists)
    typed_reference = reference.cast(uint32_lists)
    assert typed.equals(typed_reference)
def test_sequence_unicode():
    """Unicode strings mixed with None are inferred as a string array."""
    values = [u'foo', u'bar', None, u'mañana']
    result = pa.array(values)
    assert len(result) == 4
    assert result.null_count == 1
    assert result.type == pa.string()
    assert result.to_pylist() == values
def test_array_mixed_unicode_bytes():
    """Mixed text/bytes infer binary, or convert to string on request."""
    mixed = [u'qux', b'foo', bytearray(b'barz')]
    as_bytes = [b'qux', b'foo', b'barz']
    as_text = [u'qux', u'foo', u'barz']

    # Inferred type is binary
    result = pa.array(mixed)
    reference = pa.array(as_bytes, type=pa.binary())
    assert result.type == pa.binary()
    assert result.equals(reference)

    # Explicit string type
    result = pa.array(mixed, type=pa.string())
    reference = pa.array(as_text, type=pa.string())
    assert result.type == pa.string()
    assert result.equals(reference)
def test_sequence_bytes():
    """Bytes, bytearray and unicode items convert to a binary array."""
    encoded = b'ma\xc3\xb1ana'
    values = [
        b'foo',
        encoded.decode('utf-8'),  # unicode gets encoded,
        bytearray(b'bar'),
        None,
    ]
    expected = [b'foo', encoded, b'bar', None]

    # Once with the type inferred, once with it given explicitly
    for ty in [None, pa.binary()]:
        result = pa.array(values, type=ty)
        assert len(result) == 4
        assert result.null_count == 1
        assert result.type == pa.binary()
        assert result.to_pylist() == expected
def test_sequence_utf8_to_unicode():
    """Bytes given a string type must be valid UTF-8."""
    # ARROW-1225
    utf8_values = [b'foo', None, b'bar']
    result = pa.array(utf8_values, type=pa.string())
    assert result[0].as_py() == u'foo'

    # test a non-utf8 unicode string
    utf16_bytes = (u'mañana').encode('utf-16-le')
    with pytest.raises(pa.ArrowInvalid):
        pa.array([utf16_bytes], type=pa.string())
def test_sequence_fixed_size_bytes():
    """Four-byte items convert to a fixed-size binary array of width 4."""
    values = [b'foof', None, bytearray(b'barb'), b'2346']
    fixed_type = pa.binary(4)
    result = pa.array(values, type=pa.binary(4))
    assert len(result) == 4
    assert result.null_count == 1
    assert result.type == fixed_type
    assert result.to_pylist() == [b'foof', None, b'barb', b'2346']
def test_fixed_size_bytes_does_not_accept_varying_lengths():
    """An item of the wrong width is rejected by a fixed-size binary type."""
    # b'foo' is three bytes wide while the type requires four
    values = [b'foo', None, b'barb', b'2346']
    with pytest.raises(pa.ArrowInvalid):
        pa.array(values, type=pa.binary(4))
def test_sequence_date():
    """``datetime.date`` values are inferred as date32 and round-trip."""
    dates = [datetime.date(2000, 1, 1), None, datetime.date(1970, 1, 1),
             datetime.date(2040, 2, 26)]
    result = pa.array(dates)
    assert len(result) == 4
    assert result.type == pa.date32()
    assert result.null_count == 1

    # Each element converts back to the value it was built from
    for index, original in enumerate(dates):
        converted = result[index].as_py()
        if original is None:
            assert converted is None
        else:
            assert converted == original
@pytest.mark.parametrize('input',
                         [(pa.date32(), [10957, None]),
                          (pa.date64(), [10957 * 86400000, None])])
def test_sequence_explicit_types(input):
    """Date types accept both ``datetime.date`` and raw integer values."""
    date_type, raw_values = input
    first_day_of_2000 = datetime.date(2000, 1, 1)

    from_dates = pa.array([first_day_of_2000, None], type=date_type)
    from_ints = pa.array(raw_values, type=date_type)

    for converted in [from_dates, from_ints]:
        assert len(converted) == 2
        assert converted.type == date_type
        assert converted.null_count == 1
        assert converted[0].as_py() == first_day_of_2000
        assert converted[1] is pa.NA
def test_date32_overflow():
    """An integer of 2**32 is rejected for date32."""
    # Overflow
    too_large = [2**32, None]
    with pytest.raises(pa.ArrowException):
        pa.array(too_large, type=pa.date32())
def test_sequence_timestamp():
    """``datetime.datetime`` values are inferred as microsecond timestamps."""
    stamps = [
        datetime.datetime(2007, 7, 13, 1, 23, 34, 123456),
        None,
        datetime.datetime(2006, 1, 13, 12, 34, 56, 432539),
        datetime.datetime(2010, 8, 13, 5, 46, 57, 437699)
    ]
    result = pa.array(stamps)
    assert len(result) == 4
    assert result.type == pa.timestamp('us')
    assert result.null_count == 1

    # Each element converts back to the value it was built from
    for index, original in enumerate(stamps):
        converted = result[index].as_py()
        if original is None:
            assert converted is None
        else:
            assert converted == original
def test_sequence_numpy_timestamp():
    """``np.datetime64`` values are inferred as microsecond timestamps."""
    as_datetimes = [
        datetime.datetime(2007, 7, 13, 1, 23, 34, 123456),
        None,
        datetime.datetime(2006, 1, 13, 12, 34, 56, 432539),
        datetime.datetime(2010, 8, 13, 5, 46, 57, 437699)
    ]
    # Wrap every non-null datetime in a NumPy datetime64 scalar
    stamps = [None if value is None else np.datetime64(value)
              for value in as_datetimes]

    result = pa.array(stamps)
    assert len(result) == 4
    assert result.type == pa.timestamp('us')
    assert result.null_count == 1

    for index, original in enumerate(as_datetimes):
        converted = result[index].as_py()
        if original is None:
            assert converted is None
        else:
            assert converted == original
def test_sequence_timestamp_with_unit():
    """Datetimes are truncated to the resolution of the requested unit."""
    data = [
        datetime.datetime(2007, 7, 13, 1, 23, 34, 123456),
    ]
    # (unit, microsecond component expected after conversion)
    cases = [('s', 0), ('ms', 123000), ('us', 123456)]

    for unit, microsecond in cases:
        unit_type = pa.timestamp(unit)
        converted = pa.array(data, type=unit_type)
        assert len(converted) == 1
        assert converted.type == unit_type
        assert converted[0].as_py() == datetime.datetime(
            2007, 7, 13, 1, 23, 34, microsecond)
class MyDate(datetime.date):
    """Trivial ``datetime.date`` subclass that adds nothing of its own."""
class MyDatetime(datetime.datetime):
    """Trivial ``datetime.datetime`` subclass that adds nothing of its own."""
def test_datetime_subclassing():
    """Subclasses of date and datetime convert like their base classes."""
    # date subclass
    dates = [
        MyDate(2007, 7, 13),
    ]
    date_type = pa.date32()
    converted_dates = pa.array(dates, type=date_type)
    assert len(converted_dates) == 1
    assert converted_dates.type == date_type
    assert converted_dates[0].as_py() == datetime.date(2007, 7, 13)

    # datetime subclass, at every unit from seconds to microseconds
    stamps = [
        MyDatetime(2007, 7, 13, 1, 23, 34, 123456),
    ]
    # (unit, microsecond component expected after conversion)
    cases = [('s', 0), ('ms', 123000), ('us', 123456)]
    for unit, microsecond in cases:
        unit_type = pa.timestamp(unit)
        converted = pa.array(stamps, type=unit_type)
        assert len(converted) == 1
        assert converted.type == unit_type
        assert converted[0].as_py() == datetime.datetime(
            2007, 7, 13, 1, 23, 34, microsecond)
@pytest.mark.xfail(not _pandas_api.have_pandas,
                   reason="pandas required for nanosecond conversion")
def test_sequence_timestamp_nanoseconds():
    """Datetimes and datetime subclasses convert to nanosecond timestamps."""
    expected = datetime.datetime(2007, 7, 13, 1, 23, 34, 123456)
    inputs = [
        [datetime.datetime(2007, 7, 13, 1, 23, 34, 123456)],
        [MyDatetime(2007, 7, 13, 1, 23, 34, 123456)]
    ]

    for values in inputs:
        ns_type = pa.timestamp('ns')
        converted = pa.array(values, type=ns_type)
        assert len(converted) == 1
        assert converted.type == ns_type
        assert converted[0].as_py() == expected
@pytest.mark.pandas
def test_sequence_timestamp_from_int_with_unit():
    """Integers convert to timestamps counted in the requested unit.

    Also checks that an arbitrary object among the values is rejected for
    timestamp and date types.
    """
    # TODO(wesm): This test might be rewritten to assert the actual behavior
    # when pandas is not installed

    data = [1]

    # (unit, expected str() of the single element); repr() wraps the same
    # text in "Timestamp('...')".
    cases = [
        ('s', "1970-01-01 00:00:01"),
        ('ms', "1970-01-01 00:00:00.001000"),
        ('us', "1970-01-01 00:00:00.000001"),
        ('ns', "1970-01-01 00:00:00.000000001"),
    ]
    for unit, text in cases:
        ty = pa.timestamp(unit)
        arr = pa.array(data, type=ty)
        assert len(arr) == 1
        assert arr.type == ty
        assert repr(arr[0]) == "Timestamp('%s')" % text
        assert str(arr[0]) == text

    class CustomClass():
        pass

    # Each type gets its own ``raises`` block: with a single shared block,
    # the first failing call would exit it and the remaining conversions
    # would never be executed.
    for ty in [pa.timestamp('ns'), pa.date32(), pa.date64()]:
        with pytest.raises(pa.ArrowException):
            pa.array([1, CustomClass()], type=ty)
def test_sequence_nesting_levels():
    """Nesting depth is inferred; inconsistent depths are rejected."""
    int64_list = pa.list_(pa.int64())

    # (input, expected inferred type) for consistently nested inputs
    consistent_cases = [
        ([1, 2, None], pa.int64()),
        ([[1], [2], None], int64_list),
        ([[1], [2, 3, 4], [None]], int64_list),
        ([None, [[None, 1]], [[2, 3, 4], None], [None]],
         pa.list_(int64_list)),
    ]
    for values, expected_type in consistent_cases:
        converted = pa.array(values)
        assert converted.type == expected_type
        assert converted.to_pylist() == values

    exceptions = (pa.ArrowInvalid, pa.ArrowTypeError)

    # Mixed nesting levels are rejected
    inconsistent_cases = [
        [1, 2, [1]],
        [1, 2, []],
        [[1], [2], [None, [1]]],
    ]
    for values in inconsistent_cases:
        with pytest.raises(exceptions):
            pa.array(values)
def test_sequence_mixed_types_fails():
    """Inferring a type for a str/int/float mix raises ArrowTypeError."""
    mixed = ['a', 1, 2.0]
    with pytest.raises(pa.ArrowTypeError):
        pa.array(mixed)
def test_sequence_mixed_types_with_specified_type_fails():
    """A dict among strings raises TypeError for an explicit string type."""
    values = ['-10', '-5', {'a': 1}, '0', '5', '10']
    string_type = pa.string()
    with pytest.raises(TypeError):
        pa.array(values, type=string_type)
def test_sequence_decimal():
    """Decimals round-trip through decimal128(7, 3)."""
    values = [decimal.Decimal('1234.183'), decimal.Decimal('8094.234')]
    decimal_type = pa.decimal128(precision=7, scale=3)
    converted = pa.array(values, type=decimal_type)
    assert converted.to_pylist() == values
def test_sequence_decimal_different_precisions():
    """Decimals with differing digit counts round-trip via decimal128(13, 3)."""
    values = [
        decimal.Decimal('1234234983.183'), decimal.Decimal('80943244.234')
    ]
    decimal_type = pa.decimal128(precision=13, scale=3)
    converted = pa.array(values, type=decimal_type)
    assert converted.to_pylist() == values
def test_sequence_decimal_no_scale():
    """Integral decimals round-trip when only a precision is given."""
    values = [decimal.Decimal('1234234983'), decimal.Decimal('8094324')]
    decimal_type = pa.decimal128(precision=10)
    converted = pa.array(values, type=decimal_type)
    assert converted.to_pylist() == values
def test_sequence_decimal_negative():
    """Negative decimals round-trip through decimal128(10, 6)."""
    values = [decimal.Decimal('-1234.234983'), decimal.Decimal('-8.094324')]
    decimal_type = pa.decimal128(precision=10, scale=6)
    converted = pa.array(values, type=decimal_type)
    assert converted.to_pylist() == values
def test_sequence_decimal_no_whole_part():
    """Purely fractional decimals round-trip through decimal128(7, 7)."""
    values = [decimal.Decimal('-.4234983'), decimal.Decimal('.0103943')]
    decimal_type = pa.decimal128(precision=7, scale=7)
    converted = pa.array(values, type=decimal_type)
    assert converted.to_pylist() == values
def test_sequence_decimal_large_integer():
    """Decimals with many integer digits round-trip via decimal128(23, 5)."""
    values = [decimal.Decimal('-394029506937548693.42983'),
              decimal.Decimal('32358695912932.01033')]
    decimal_type = pa.decimal128(precision=23, scale=5)
    converted = pa.array(values, type=decimal_type)
    assert converted.to_pylist() == values
def test_sequence_decimal_from_integers():
    """Python ints convert to a decimal type and compare equal as Decimals."""
    integers = [0, 1, -39402950693754869342983]
    as_decimals = [decimal.Decimal(value) for value in integers]
    decimal_type = pa.decimal128(precision=28, scale=5)
    converted = pa.array(integers, type=decimal_type)
    assert converted.to_pylist() == as_decimals
def test_range_types():
    """A ``range`` converts to the same array as the equivalent tuple."""
    from_range = pa.array(range(3))
    from_tuple = pa.array((0, 1, 2))
    assert from_range.equals(from_tuple)
def test_empty_range():
    """An empty ``range`` yields an empty array of null type."""
    result = pa.array(range(0))
    assert len(result) == 0
    assert result.null_count == 0
    assert result.type == pa.null()
    assert result.to_pylist() == []
def test_structarray():
    """``StructArray.from_arrays`` builds structs from child arrays."""
    # No children at all
    empty = pa.StructArray.from_arrays([], names=[])
    assert empty.type == pa.struct([])
    assert len(empty) == 0
    assert empty.to_pylist() == []

    # Three children, each with a null in a different row
    ints = pa.array([None, 2, 3], type=pa.int64())
    strs = pa.array([u'a', None, u'c'], type=pa.string())
    bools = pa.array([True, False, None], type=pa.bool_())
    structs = pa.StructArray.from_arrays(
        [ints, strs, bools],
        ['ints', 'strs', 'bools'])

    expected = [
        {'ints': None, 'strs': u'a', 'bools': True},
        {'ints': 2, 'strs': None, 'bools': False},
        {'ints': 3, 'strs': u'c', 'bools': None},
    ]
    pylist = structs.to_pylist()
    assert pylist == expected, (pylist, expected)

    # len(names) != len(arrays)
    with pytest.raises(ValueError):
        pa.StructArray.from_arrays([ints], ['ints', 'strs'])
def test_struct_from_dicts():
    """Dicts convert to an explicit struct type; missing keys become None."""
    struct_type = pa.struct([pa.field('a', pa.int32()),
                             pa.field('b', pa.string()),
                             pa.field('c', pa.bool_())])

    # Empty input
    converted = pa.array([], type=struct_type)
    assert converted.to_pylist() == []

    # Fully populated dicts round-trip unchanged
    rows = [{'a': 5, 'b': 'foo', 'c': True},
            {'a': 6, 'b': 'bar', 'c': False}]
    converted = pa.array(rows, type=struct_type)
    assert converted.to_pylist() == rows

    # With omitted values
    rows = [{'a': 5, 'c': True},
            None,
            {},
            {'a': None, 'b': 'bar'}]
    converted = pa.array(rows, type=struct_type)
    filled_in = [{'a': 5, 'b': None, 'c': True},
                 None,
                 {'a': None, 'b': None, 'c': None},
                 {'a': None, 'b': 'bar', 'c': None}]
    assert converted.to_pylist() == filled_in
def test_struct_from_tuples():
    """Tuples convert positionally to an explicit struct type."""
    struct_type = pa.struct([pa.field('a', pa.int32()),
                             pa.field('b', pa.string()),
                             pa.field('c', pa.bool_())])

    rows = [(5, 'foo', True),
            (6, 'bar', False)]
    as_dicts = [{'a': 5, 'b': 'foo', 'c': True},
                {'a': 6, 'b': 'bar', 'c': False}]
    from_list = pa.array(rows, type=struct_type)

    # The same tuples held in an object ndarray must convert identically
    rows_as_ndarray = np.empty(len(rows), dtype=object)
    rows_as_ndarray[:] = rows
    from_ndarray = pa.array(rows_as_ndarray, type=struct_type)
    assert from_list.to_pylist() == as_dicts
    assert from_list.equals(from_ndarray)

    # With omitted values
    rows = [(5, 'foo', None),
            None,
            (6, None, False)]
    as_dicts = [{'a': 5, 'b': 'foo', 'c': None},
                None,
                {'a': 6, 'b': None, 'c': False}]
    from_list = pa.array(rows, type=struct_type)
    assert from_list.to_pylist() == as_dicts

    # Invalid tuple size
    wrong_sizes = [(5, 'foo'), (), ('5', 'foo', True, None)]
    for bad_row in wrong_sizes:
        with pytest.raises(ValueError, match="(?i)tuple size"):
            pa.array([bad_row], type=struct_type)
def test_struct_from_mixed_sequence():
    """Mixing a tuple and a dict for one struct array raises TypeError."""
    # It is forbidden to mix dicts and tuples when initializing a struct array
    struct_type = pa.struct([pa.field('a', pa.int32()),
                             pa.field('b', pa.string()),
                             pa.field('c', pa.bool_())])
    tuple_then_dict = [(5, 'foo', True),
                       {'a': 6, 'b': 'bar', 'c': False}]
    with pytest.raises(TypeError):
        pa.array(tuple_then_dict, type=struct_type)
def test_struct_from_dicts_inference():
    """Struct types and values are inferred from sequences of dicts."""
    expected_type = pa.struct([pa.field('a', pa.int64()),
                               pa.field('b', pa.string()),
                               pa.field('c', pa.bool_())])
    data = [{'a': 5, 'b': u'foo', 'c': True},
            {'a': 6, 'b': u'bar', 'c': False}]
    arr = pa.array(data)
    check_struct_type(arr.type, expected_type)
    assert arr.to_pylist() == data

    # With omitted values
    data = [{'a': 5, 'c': True},
            None,
            {},
            {'a': None, 'b': u'bar'}]
    expected = [{'a': 5, 'b': None, 'c': True},
                None,
                {'a': None, 'b': None, 'c': None},
                {'a': None, 'b': u'bar', 'c': None}]
    arr = pa.array(data)
    data_as_ndarray = np.empty(len(data), dtype=object)
    data_as_ndarray[:] = data
    # Convert the object ndarray rather than the list a second time, so the
    # comparison below really checks the ndarray input against the list one.
    arr2 = pa.array(data_as_ndarray)
    check_struct_type(arr.type, expected_type)
    assert arr.to_pylist() == expected
    assert arr.equals(arr2)

    # Nested
    # NOTE(review): expected_type documents the intended nested type but is
    # not asserted below -- confirm whether a type check should be added.
    expected_type = pa.struct([
        pa.field('a', pa.struct([pa.field('aa', pa.list_(pa.int64())),
                                 pa.field('ab', pa.bool_())])),
        pa.field('b', pa.string())])
    data = [{'a': {'aa': [5, 6], 'ab': True}, 'b': 'foo'},
            {'a': {'aa': None, 'ab': False}, 'b': None},
            {'a': None, 'b': 'bar'}]
    arr = pa.array(data)
    assert arr.to_pylist() == data

    # Edge cases
    arr = pa.array([{}])
    assert arr.type == pa.struct([])
    assert arr.to_pylist() == [{}]

    # Mixing structs and scalars is rejected
    with pytest.raises((pa.ArrowInvalid, pa.ArrowTypeError)):
        pa.array([1, {'a': 2}])
def test_structarray_from_arrays_coerce():
    """``from_arrays`` accepts plain lists and requires field names."""
    # ARROW-1706
    ints = [None, 2, 3]
    strs = [u'a', None, u'c']
    bools = [True, False, None]
    ints_nonnull = [1, 2, 3]

    children = [ints, strs, bools, ints_nonnull]
    from_lists = pa.StructArray.from_arrays(
        children, ['ints', 'strs', 'bools', 'int_nonnull'])

    # Reference built from explicitly converted child arrays
    from_arrays = pa.StructArray.from_arrays(
        [pa.array(ints, type='int64'),
         pa.array(strs, type='utf8'),
         pa.array(bools),
         pa.array(ints_nonnull, type='int64')],
        ['ints', 'strs', 'bools', 'int_nonnull'])

    # Omitting the names is an error
    with pytest.raises(ValueError):
        pa.StructArray.from_arrays(children)

    assert from_lists.equals(from_arrays)
def test_decimal_array_with_none_and_nan():
    """None, float NaN and Decimal NaN all become nulls in decimal arrays."""
    values = [decimal.Decimal('1.234'), None, np.nan, decimal.Decimal('nan')]

    # Inferred type
    inferred = pa.array(values)
    assert inferred.type == pa.decimal128(4, 3)
    assert inferred.to_pylist() == values[:2] + [None, None]

    # Explicit, wider type: the valid value gains a trailing zero
    typed = pa.array(values, type=pa.decimal128(10, 4))
    assert typed.to_pylist() == [decimal.Decimal('1.2340'), None, None, None]
@pytest.mark.parametrize('tz,name', [
    (pytz.FixedOffset(90), '+01:30'),
    (pytz.FixedOffset(-90), '-01:30'),
    (pytz.utc, 'UTC'),
    (pytz.timezone('America/New_York'), 'America/New_York')
])
def test_timezone_string(tz, name):
    """tzinfo objects and their string names convert in both directions."""
    as_string = pa.lib.tzinfo_to_string(tz)
    assert as_string == name
    as_tzinfo = pa.lib.string_to_tzinfo(name)
    assert as_tzinfo == tz