| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| |
| from collections import OrderedDict |
| |
| import pickle |
| import pytest |
| import hypothesis as h |
| import hypothesis.strategies as st |
| |
| import numpy as np |
| import pyarrow as pa |
| import pyarrow.types as types |
| import pyarrow.tests.strategies as past |
| |
| |
| def get_many_types(): |
| # returning them from a function is required because of pa.dictionary |
| # type holds a pyarrow array and test_array.py::test_toal_bytes_allocated |
| # checks that the default memory pool has zero allocated bytes |
| return ( |
| pa.null(), |
| pa.bool_(), |
| pa.int32(), |
| pa.time32('s'), |
| pa.time64('us'), |
| pa.date32(), |
| pa.timestamp('us'), |
| pa.timestamp('us', tz='UTC'), |
| pa.timestamp('us', tz='Europe/Paris'), |
| pa.float16(), |
| pa.float32(), |
| pa.float64(), |
| pa.decimal128(19, 4), |
| pa.string(), |
| pa.binary(), |
| pa.binary(10), |
| pa.list_(pa.int32()), |
| pa.struct([pa.field('a', pa.int32()), |
| pa.field('b', pa.int8()), |
| pa.field('c', pa.string())]), |
| pa.struct([pa.field('a', pa.int32(), nullable=False), |
| pa.field('b', pa.int8(), nullable=False), |
| pa.field('c', pa.string())]), |
| pa.union([pa.field('a', pa.binary(10)), |
| pa.field('b', pa.string())], mode=pa.lib.UnionMode_DENSE), |
| pa.union([pa.field('a', pa.binary(10)), |
| pa.field('b', pa.string())], mode=pa.lib.UnionMode_SPARSE), |
| pa.union([pa.field('a', pa.binary(10), nullable=False), |
| pa.field('b', pa.string())], mode=pa.lib.UnionMode_SPARSE), |
| pa.dictionary(pa.int32(), pa.string()) |
| ) |
| |
| |
| def test_is_boolean(): |
| assert types.is_boolean(pa.bool_()) |
| assert not types.is_boolean(pa.int8()) |
| |
| |
| def test_is_integer(): |
| signed_ints = [pa.int8(), pa.int16(), pa.int32(), pa.int64()] |
| unsigned_ints = [pa.uint8(), pa.uint16(), pa.uint32(), pa.uint64()] |
| |
| for t in signed_ints + unsigned_ints: |
| assert types.is_integer(t) |
| |
| for t in signed_ints: |
| assert types.is_signed_integer(t) |
| assert not types.is_unsigned_integer(t) |
| |
| for t in unsigned_ints: |
| assert types.is_unsigned_integer(t) |
| assert not types.is_signed_integer(t) |
| |
| assert not types.is_integer(pa.float32()) |
| assert not types.is_signed_integer(pa.float32()) |
| |
| |
| def test_is_floating(): |
| for t in [pa.float16(), pa.float32(), pa.float64()]: |
| assert types.is_floating(t) |
| |
| assert not types.is_floating(pa.int32()) |
| |
| |
| def test_is_null(): |
| assert types.is_null(pa.null()) |
| assert not types.is_null(pa.list_(pa.int32())) |
| |
| |
| def test_is_decimal(): |
| assert types.is_decimal(pa.decimal128(19, 4)) |
| assert not types.is_decimal(pa.int32()) |
| |
| |
| def test_is_list(): |
| assert types.is_list(pa.list_(pa.int32())) |
| assert not types.is_list(pa.int32()) |
| |
| |
| def test_is_dictionary(): |
| assert types.is_dictionary(pa.dictionary(pa.int32(), pa.string())) |
| assert not types.is_dictionary(pa.int32()) |
| |
| |
| def test_is_nested_or_struct(): |
| struct_ex = pa.struct([pa.field('a', pa.int32()), |
| pa.field('b', pa.int8()), |
| pa.field('c', pa.string())]) |
| |
| assert types.is_struct(struct_ex) |
| assert not types.is_struct(pa.list_(pa.int32())) |
| |
| assert types.is_nested(struct_ex) |
| assert types.is_nested(pa.list_(pa.int32())) |
| assert not types.is_nested(pa.int32()) |
| |
| |
| def test_is_union(): |
| for mode in [pa.lib.UnionMode_SPARSE, pa.lib.UnionMode_DENSE]: |
| assert types.is_union(pa.union([pa.field('a', pa.int32()), |
| pa.field('b', pa.int8()), |
| pa.field('c', pa.string())], |
| mode=mode)) |
| assert not types.is_union(pa.list_(pa.int32())) |
| |
| |
| # TODO(wesm): is_map, once implemented |
| |
| |
| def test_is_binary_string(): |
| assert types.is_binary(pa.binary()) |
| assert not types.is_binary(pa.string()) |
| |
| assert types.is_string(pa.string()) |
| assert types.is_unicode(pa.string()) |
| assert not types.is_string(pa.binary()) |
| |
| assert types.is_fixed_size_binary(pa.binary(5)) |
| assert not types.is_fixed_size_binary(pa.binary()) |
| |
| |
| def test_is_temporal_date_time_timestamp(): |
| date_types = [pa.date32(), pa.date64()] |
| time_types = [pa.time32('s'), pa.time64('ns')] |
| timestamp_types = [pa.timestamp('ms')] |
| |
| for case in date_types + time_types + timestamp_types: |
| assert types.is_temporal(case) |
| |
| for case in date_types: |
| assert types.is_date(case) |
| assert not types.is_time(case) |
| assert not types.is_timestamp(case) |
| |
| for case in time_types: |
| assert types.is_time(case) |
| assert not types.is_date(case) |
| assert not types.is_timestamp(case) |
| |
| for case in timestamp_types: |
| assert types.is_timestamp(case) |
| assert not types.is_date(case) |
| assert not types.is_time(case) |
| |
| assert not types.is_temporal(pa.int32()) |
| |
| |
| def test_is_primitive(): |
| assert types.is_primitive(pa.int32()) |
| assert not types.is_primitive(pa.list_(pa.int32())) |
| |
| |
| def test_timestamp(): |
| for unit in ('s', 'ms', 'us', 'ns'): |
| for tz in (None, 'UTC', 'Europe/Paris'): |
| ty = pa.timestamp(unit, tz=tz) |
| assert ty.unit == unit |
| assert ty.tz == tz |
| |
| for invalid_unit in ('m', 'arbit', 'rary'): |
| with pytest.raises(ValueError, match='Invalid TimeUnit string'): |
| pa.timestamp(invalid_unit) |
| |
| |
| def test_time32_units(): |
| for valid_unit in ('s', 'ms'): |
| ty = pa.time32(valid_unit) |
| assert ty.unit == valid_unit |
| |
| for invalid_unit in ('m', 'us', 'ns'): |
| error_msg = 'Invalid TimeUnit for time32: {}'.format(invalid_unit) |
| with pytest.raises(ValueError, match=error_msg): |
| pa.time32(invalid_unit) |
| |
| |
| def test_time64_units(): |
| for valid_unit in ('us', 'ns'): |
| ty = pa.time64(valid_unit) |
| assert ty.unit == valid_unit |
| |
| for invalid_unit in ('m', 's', 'ms'): |
| error_msg = 'Invalid TimeUnit for time64: {}'.format(invalid_unit) |
| with pytest.raises(ValueError, match=error_msg): |
| pa.time64(invalid_unit) |
| |
| |
| def test_list_type(): |
| ty = pa.list_(pa.int64()) |
| assert ty.value_type == pa.int64() |
| |
| with pytest.raises(TypeError): |
| pa.list_(None) |
| |
| |
| def test_struct_type(): |
| fields = [ |
| # Duplicate field name on purpose |
| pa.field('a', pa.int64()), |
| pa.field('a', pa.int32()), |
| pa.field('b', pa.int32()) |
| ] |
| ty = pa.struct(fields) |
| |
| assert len(ty) == ty.num_children == 3 |
| assert list(ty) == fields |
| assert ty[0].name == 'a' |
| assert ty[2].type == pa.int32() |
| with pytest.raises(IndexError): |
| assert ty[3] |
| |
| assert ty['b'] == ty[2] |
| |
| # Duplicate |
| with pytest.warns(UserWarning): |
| with pytest.raises(KeyError): |
| ty['a'] |
| |
| # Not found |
| with pytest.raises(KeyError): |
| ty['c'] |
| |
| # Neither integer nor string |
| with pytest.raises(TypeError): |
| ty[None] |
| |
| for a, b in zip(ty, fields): |
| a == b |
| |
| # Construct from list of tuples |
| ty = pa.struct([('a', pa.int64()), |
| ('a', pa.int32()), |
| ('b', pa.int32())]) |
| assert list(ty) == fields |
| for a, b in zip(ty, fields): |
| a == b |
| |
| # Construct from mapping |
| fields = [pa.field('a', pa.int64()), |
| pa.field('b', pa.int32())] |
| ty = pa.struct(OrderedDict([('a', pa.int64()), |
| ('b', pa.int32())])) |
| assert list(ty) == fields |
| for a, b in zip(ty, fields): |
| a == b |
| |
| # Invalid args |
| with pytest.raises(TypeError): |
| pa.struct([('a', None)]) |
| |
| |
| def test_union_type(): |
| def check_fields(ty, fields): |
| assert ty.num_children == len(fields) |
| assert [ty[i] for i in range(ty.num_children)] == fields |
| |
| fields = [pa.field('x', pa.list_(pa.int32())), |
| pa.field('y', pa.binary())] |
| for mode in ('sparse', pa.lib.UnionMode_SPARSE): |
| ty = pa.union(fields, mode=mode) |
| assert ty.mode == 'sparse' |
| check_fields(ty, fields) |
| assert ty.type_codes == [0, 1] |
| for mode in ('dense', pa.lib.UnionMode_DENSE): |
| ty = pa.union(fields, mode=mode) |
| assert ty.mode == 'dense' |
| check_fields(ty, fields) |
| assert ty.type_codes == [0, 1] |
| for mode in ('unknown', 2): |
| with pytest.raises(ValueError, match='Invalid union mode'): |
| pa.union(fields, mode=mode) |
| |
| |
| def test_dictionary_type(): |
| ty0 = pa.dictionary(pa.int32(), pa.string()) |
| assert ty0.index_type == pa.int32() |
| assert ty0.value_type == pa.string() |
| assert ty0.ordered is False |
| |
| ty1 = pa.dictionary(pa.int8(), pa.float64(), ordered=True) |
| assert ty1.index_type == pa.int8() |
| assert ty1.value_type == pa.float64() |
| assert ty1.ordered is True |
| |
| # construct from non-arrow objects |
| ty2 = pa.dictionary('int8', 'string') |
| assert ty2.index_type == pa.int8() |
| assert ty2.value_type == pa.string() |
| assert ty2.ordered is False |
| |
| |
| def test_types_hashable(): |
| many_types = get_many_types() |
| in_dict = {} |
| for i, type_ in enumerate(many_types): |
| assert hash(type_) == hash(type_) |
| in_dict[type_] = i |
| assert len(in_dict) == len(many_types) |
| for i, type_ in enumerate(many_types): |
| assert in_dict[type_] == i |
| |
| |
| def test_types_picklable(): |
| for ty in get_many_types(): |
| data = pickle.dumps(ty) |
| assert pickle.loads(data) == ty |
| |
| |
| def test_fields_hashable(): |
| in_dict = {} |
| fields = [pa.field('a', pa.int32()), |
| pa.field('a', pa.int64()), |
| pa.field('a', pa.int64(), nullable=False), |
| pa.field('b', pa.int32()), |
| pa.field('b', pa.int32(), nullable=False)] |
| for i, field in enumerate(fields): |
| in_dict[field] = i |
| assert len(in_dict) == len(fields) |
| for i, field in enumerate(fields): |
| assert in_dict[field] == i |
| |
| |
| @pytest.mark.parametrize('t,check_func', [ |
| (pa.date32(), types.is_date32), |
| (pa.date64(), types.is_date64), |
| (pa.time32('s'), types.is_time32), |
| (pa.time64('ns'), types.is_time64), |
| (pa.int8(), types.is_int8), |
| (pa.int16(), types.is_int16), |
| (pa.int32(), types.is_int32), |
| (pa.int64(), types.is_int64), |
| (pa.uint8(), types.is_uint8), |
| (pa.uint16(), types.is_uint16), |
| (pa.uint32(), types.is_uint32), |
| (pa.uint64(), types.is_uint64), |
| (pa.float16(), types.is_float16), |
| (pa.float32(), types.is_float32), |
| (pa.float64(), types.is_float64) |
| ]) |
| def test_exact_primitive_types(t, check_func): |
| assert check_func(t) |
| |
| |
| def test_type_id(): |
| # enum values are not exposed publicly |
| for ty in get_many_types(): |
| assert isinstance(ty.id, int) |
| |
| |
| def test_bit_width(): |
| for ty, expected in [(pa.bool_(), 1), |
| (pa.int8(), 8), |
| (pa.uint32(), 32), |
| (pa.float16(), 16), |
| (pa.decimal128(19, 4), 128), |
| (pa.binary(42), 42 * 8)]: |
| assert ty.bit_width == expected |
| for ty in [pa.binary(), pa.string(), pa.list_(pa.int16())]: |
| with pytest.raises(ValueError, match="fixed width"): |
| ty.bit_width |
| |
| |
| def test_fixed_size_binary_byte_width(): |
| ty = pa.binary(5) |
| assert ty.byte_width == 5 |
| |
| |
| def test_decimal_properties(): |
| ty = pa.decimal128(19, 4) |
| assert ty.byte_width == 16 |
| assert ty.precision == 19 |
| assert ty.scale == 4 |
| |
| |
| def test_decimal_overflow(): |
| pa.decimal128(1, 0) |
| pa.decimal128(38, 0) |
| for i in (0, -1, 39): |
| with pytest.raises(ValueError): |
| pa.decimal128(39, 0) |
| |
| |
| def test_type_equality_operators(): |
| many_types = get_many_types() |
| non_pyarrow = ('foo', 16, {'s', 'e', 't'}) |
| |
| for index, ty in enumerate(many_types): |
| # could use two parametrization levels, |
| # but that'd bloat pytest's output |
| for i, other in enumerate(many_types + non_pyarrow): |
| if i == index: |
| assert ty == other |
| else: |
| assert ty != other |
| |
| |
| def test_field_basic(): |
| t = pa.string() |
| f = pa.field('foo', t) |
| |
| assert f.name == 'foo' |
| assert f.nullable |
| assert f.type is t |
| assert repr(f) == "pyarrow.Field<foo: string>" |
| |
| f = pa.field('foo', t, False) |
| assert not f.nullable |
| |
| with pytest.raises(TypeError): |
| pa.field('foo', None) |
| |
| |
| def test_field_equals(): |
| meta1 = {b'foo': b'bar'} |
| meta2 = {b'bizz': b'bazz'} |
| |
| f1 = pa.field('a', pa.int8(), nullable=True) |
| f2 = pa.field('a', pa.int8(), nullable=True) |
| f3 = pa.field('a', pa.int8(), nullable=False) |
| f4 = pa.field('a', pa.int16(), nullable=False) |
| f5 = pa.field('b', pa.int16(), nullable=False) |
| f6 = pa.field('a', pa.int8(), nullable=True, metadata=meta1) |
| f7 = pa.field('a', pa.int8(), nullable=True, metadata=meta1) |
| f8 = pa.field('a', pa.int8(), nullable=True, metadata=meta2) |
| |
| assert f1.equals(f2) |
| assert f6.equals(f7) |
| assert not f1.equals(f3) |
| assert not f1.equals(f4) |
| assert not f3.equals(f4) |
| assert not f1.equals(f6) |
| assert not f4.equals(f5) |
| assert not f7.equals(f8) |
| |
| |
| def test_field_equality_operators(): |
| f1 = pa.field('a', pa.int8(), nullable=True) |
| f2 = pa.field('a', pa.int8(), nullable=True) |
| f3 = pa.field('b', pa.int8(), nullable=True) |
| f4 = pa.field('b', pa.int8(), nullable=False) |
| |
| assert f1 == f2 |
| assert f1 != f3 |
| assert f3 != f4 |
| assert f1 != 'foo' |
| |
| |
| def test_field_metadata(): |
| f1 = pa.field('a', pa.int8()) |
| f2 = pa.field('a', pa.int8(), metadata={}) |
| f3 = pa.field('a', pa.int8(), metadata={b'bizz': b'bazz'}) |
| |
| assert f1.metadata is None |
| assert f2.metadata == {} |
| assert f3.metadata[b'bizz'] == b'bazz' |
| |
| |
| def test_field_add_remove_metadata(): |
| import collections |
| |
| f0 = pa.field('foo', pa.int32()) |
| |
| assert f0.metadata is None |
| |
| metadata = {b'foo': b'bar', b'pandas': b'badger'} |
| metadata2 = collections.OrderedDict([ |
| (b'a', b'alpha'), |
| (b'b', b'beta') |
| ]) |
| |
| f1 = f0.add_metadata(metadata) |
| assert f1.metadata == metadata |
| |
| f2 = f0.add_metadata(metadata2) |
| assert f2.metadata == metadata2 |
| |
| with pytest.raises(TypeError): |
| f0.add_metadata([1, 2, 3]) |
| |
| f3 = f1.remove_metadata() |
| assert f3.metadata is None |
| |
| # idempotent |
| f4 = f3.remove_metadata() |
| assert f4.metadata is None |
| |
| f5 = pa.field('foo', pa.int32(), True, metadata) |
| f6 = f0.add_metadata(metadata) |
| assert f5.equals(f6) |
| |
| |
| def test_is_integer_value(): |
| assert pa.types.is_integer_value(1) |
| assert pa.types.is_integer_value(np.int64(1)) |
| assert not pa.types.is_integer_value('1') |
| |
| |
| def test_is_float_value(): |
| assert not pa.types.is_float_value(1) |
| assert pa.types.is_float_value(1.) |
| assert pa.types.is_float_value(np.float64(1)) |
| assert not pa.types.is_float_value('1.0') |
| |
| |
| def test_is_boolean_value(): |
| assert not pa.types.is_boolean_value(1) |
| assert pa.types.is_boolean_value(True) |
| assert pa.types.is_boolean_value(False) |
| assert pa.types.is_boolean_value(np.bool_(True)) |
| assert pa.types.is_boolean_value(np.bool_(False)) |
| |
| |
| @h.given( |
| past.all_types | |
| past.all_fields | |
| past.all_schemas |
| ) |
| @h.example( |
| pa.field(name='', type=pa.null(), metadata={'0': '', '': ''}) |
| ) |
| def test_pickling(field): |
| data = pickle.dumps(field) |
| assert pickle.loads(data) == field |
| |
| |
| @h.given( |
| st.lists(past.all_types) | |
| st.lists(past.all_fields) | |
| st.lists(past.all_schemas) |
| ) |
| def test_hashing(items): |
| h.assume( |
| # well, this is still O(n^2), but makes the input unique |
| all(not a.equals(b) for i, a in enumerate(items) for b in items[:i]) |
| ) |
| |
| container = {} |
| for i, item in enumerate(items): |
| assert hash(item) == hash(item) |
| container[item] = i |
| |
| assert len(container) == len(items) |
| |
| for i, item in enumerate(items): |
| assert container[item] == i |