| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| |
| import datetime |
| |
| import pytz |
| import hypothesis as h |
| import hypothesis.strategies as st |
| import hypothesis.extra.numpy as npst |
| import hypothesis.extra.pytz as tzst |
| import numpy as np |
| |
| import pyarrow as pa |
| |
| |
# TODO(kszucs): alphanum_text, surrogate_text
# Text whose characters are limited to the code points 'A' (0x41) through
# '~' (0x7E); the `fields` strategy draws field names from it.
custom_text = st.text(
    st.characters(min_codepoint=ord('A'), max_codepoint=ord('~'))
)
| |
# Strategies that always yield one fixed, parameter-free type.
null_type = st.just(pa.null())
bool_type = st.just(pa.bool_())

binary_type = st.just(pa.binary())
string_type = st.just(pa.string())
large_binary_type = st.just(pa.large_binary())
large_string_type = st.just(pa.large_string())
# pa.binary() called with a drawn length between 0 and 16 (inclusive).
fixed_size_binary_type = st.builds(
    pa.binary, st.integers(min_value=0, max_value=16)
)
# Any of the binary/string flavoured types defined above.
binary_like_types = (
    binary_type |
    string_type |
    large_binary_type |
    large_string_type |
    fixed_size_binary_type
)
| |
# Signed integer types of every supported bit width.
signed_integer_types = st.sampled_from(
    [pa.int8(), pa.int16(), pa.int32(), pa.int64()]
)
# Unsigned integer types of every supported bit width.
unsigned_integer_types = st.sampled_from(
    [pa.uint8(), pa.uint16(), pa.uint32(), pa.uint64()]
)
# Either a signed or an unsigned integer type.
integer_types = signed_integer_types | unsigned_integer_types
| |
floating_types = st.sampled_from(
    [pa.float16(), pa.float32(), pa.float64()]
)
# Precision and scale are drawn independently of each other, each between
# 1 and the upper bound given below (38 for decimal128, 76 for decimal256).
decimal128_type = st.builds(
    pa.decimal128,
    precision=st.integers(min_value=1, max_value=38),
    scale=st.integers(min_value=1, max_value=38),
)
decimal256_type = st.builds(
    pa.decimal256,
    precision=st.integers(min_value=1, max_value=76),
    scale=st.integers(min_value=1, max_value=76),
)
# Any integer, floating point or decimal type.
numeric_types = (
    integer_types |
    floating_types |
    decimal128_type |
    decimal256_type
)
| |
date_types = st.sampled_from([pa.date32(), pa.date64()])
# time32 is sampled with the 's'/'ms' units, time64 with 'us'/'ns'.
time_types = st.sampled_from([
    pa.time32('s'),
    pa.time32('ms'),
    pa.time64('us'),
    pa.time64('ns'),
])
# Timestamps combine a sampled unit with a timezone drawn from the
# hypothesis pytz extra.
timestamp_types = st.builds(
    pa.timestamp,
    unit=st.sampled_from(['s', 'ms', 'us', 'ns']),
    tz=tzst.timezones(),
)
duration_types = st.builds(
    pa.duration, st.sampled_from(['s', 'ms', 'us', 'ns'])
)
interval_types = st.just(pa.month_day_nano_interval())
# Any date, time, timestamp, duration or interval type.
temporal_types = (
    date_types |
    time_types |
    timestamp_types |
    duration_types |
    interval_types
)
| |
# Every non-nested type strategy defined above.
primitive_types = (
    null_type |
    bool_type |
    numeric_types |
    temporal_types |
    binary_like_types
)

# Field metadata: dictionaries mapping arbitrary text to arbitrary text.
metadata = st.dictionaries(keys=st.text(), values=st.text())
| |
| |
@st.composite
def fields(draw, type_strategy=primitive_types):
    """
    Draw a pyarrow Field.

    The name comes from ``custom_text``, the type from ``type_strategy``
    and the metadata from ``metadata``. A field of null type is always
    nullable; for any other type the nullability is a drawn boolean.
    """
    field_name = draw(custom_text)
    field_type = draw(type_strategy)
    # Short-circuits for the null type, so no boolean is drawn for it.
    is_nullable = pa.types.is_null(field_type) or draw(st.booleans())
    field_meta = draw(metadata)
    return pa.field(
        field_name,
        type=field_type,
        nullable=is_nullable,
        metadata=field_meta,
    )
| |
| |
def list_types(item_strategy=primitive_types):
    """
    Return a strategy for list types whose items come from item_strategy.

    Three flavours are produced: ``pa.list_`` and ``pa.large_list`` called
    with the item type only, and ``pa.list_`` called with an additional
    list size drawn between 0 and 16 (inclusive).
    """
    variable_size = st.builds(pa.list_, item_strategy)
    large = st.builds(pa.large_list, item_strategy)
    fixed_size = st.builds(
        pa.list_,
        item_strategy,
        st.integers(min_value=0, max_value=16),
    )
    return st.one_of(variable_size, large, fixed_size)
| |
| |
@st.composite
def struct_types(draw, item_strategy=primitive_types):
    """
    Draw a struct type whose child fields have types from item_strategy.

    Examples in which two child fields share a name are discarded.
    """
    members = draw(st.lists(fields(item_strategy)))
    # check that field names are unique, see ARROW-9997
    distinct_names = {member.name for member in members}
    h.assume(len(distinct_names) == len(members))
    return pa.struct(members)
| |
| |
def dictionary_types(key_strategy=None, value_strategy=None):
    """
    Return a strategy for dictionary types.

    Parameters
    ----------
    key_strategy : hypothesis strategy or None
        Strategy for the index type. None selects ``signed_integer_types``.
    value_strategy : hypothesis strategy or None
        Strategy for the value type. None selects booleans, integers,
        float32/float64, binary, string and fixed size binary.

    The defaults are selected with explicit ``is None`` checks rather than
    ``or``, so the choice never depends on the truthiness of a strategy
    object.
    """
    if key_strategy is None:
        key_strategy = signed_integer_types
    if value_strategy is None:
        value_strategy = st.one_of(
            bool_type,
            integer_types,
            st.sampled_from([pa.float32(), pa.float64()]),
            binary_type,
            string_type,
            fixed_size_binary_type,
        )
    return st.builds(pa.dictionary, key_strategy, value_strategy)
| |
| |
@st.composite
def map_types(draw, key_strategy=primitive_types,
              item_strategy=primitive_types):
    """
    Draw a map type from a drawn key type and a drawn item type.

    Examples whose drawn key type is the null type are discarded.
    """
    drawn_key = draw(key_strategy)
    key_is_null = pa.types.is_null(drawn_key)
    h.assume(not key_is_null)
    drawn_item = draw(item_strategy)
    return pa.map_(drawn_key, drawn_item)
| |
| |
# Not covered by the strategies above: union types and extension types.
| |
| |
def schemas(type_strategy=primitive_types, max_fields=None):
    """
    Return a strategy for schemas.

    Each schema is built by ``pa.schema`` from a list of fields whose
    types come from type_strategy; max_fields is passed on as the maximum
    length of that list.
    """
    field_lists = st.lists(fields(type_strategy), max_size=max_fields)
    return st.builds(pa.schema, field_lists)
| |
| |
# The definition refers to itself (lists and structs of `all_types`), so
# it is wrapped in st.deferred and only evaluated on first use.
all_types = st.deferred(
    lambda: st.one_of(
        primitive_types,
        list_types(),
        struct_types(),
        dictionary_types(),
        map_types(),
        list_types(all_types),
        struct_types(all_types),
    )
)
all_fields = fields(type_strategy=all_types)
all_schemas = schemas(type_strategy=all_types)
| |
| |
# Lengths (0 to 20 inclusive) drawn when a caller gives no explicit size.
_default_array_sizes = st.integers(0, 20)
| |
| |
@st.composite
def _pylist(draw, value_type, size, nullable=True):
    """
    Draw an array of value_type with the given size as a Python list.

    The `nullable` argument is accepted but not forwarded: the underlying
    array is always drawn with ``nullable=False``.
    """
    return draw(arrays(value_type, size=size, nullable=False)).to_pylist()
| |
| |
@st.composite
def _pymap(draw, key_type, value_type, size, nullable=True):
    """
    Draw a list of (key, value) tuples.

    `size` is a strategy: the number of entries is drawn from it first,
    then that many keys and that many values are drawn and paired up
    positionally.
    """
    n_entries = draw(size)
    map_keys = draw(_pylist(key_type, size=n_entries, nullable=False))
    map_items = draw(
        _pylist(value_type, size=n_entries, nullable=nullable)
    )
    pairs = zip(map_keys, map_items)
    return list(pairs)
| |
| |
@st.composite
def arrays(draw, type, size=None, nullable=True):
    """
    Draw a pyarrow Array of the requested type and length.

    Parameters
    ----------
    type : pyarrow.DataType or hypothesis strategy
        The array type, or a strategy from which the type is drawn.
    size : int, hypothesis strategy or None
        The array length, or a strategy from which it is drawn. None draws
        the length from ``_default_array_sizes``.
    nullable : bool, default True
        Whether None may appear among the generated values. The branches
        that return early (integer, floating point, dictionary, struct)
        do not apply it at this level.

    Raises
    ------
    TypeError
        If `type` is neither a DataType nor a strategy, or if `size` is
        not an int, a strategy or None.
    NotImplementedError
        If no branch below handles the resolved type.

    Notes
    -----
    Examples are discarded for decimal types, for the null type when
    `nullable` is false, and for struct types without fields.
    """
    if isinstance(type, st.SearchStrategy):
        ty = draw(type)
    elif isinstance(type, pa.DataType):
        ty = type
    else:
        raise TypeError('Type must be a pyarrow DataType')

    if isinstance(size, st.SearchStrategy):
        size = draw(size)
    elif size is None:
        size = draw(_default_array_sizes)
    elif not isinstance(size, int):
        raise TypeError('Size must be an integer')

    if pa.types.is_null(ty):
        # A null array consists of nulls only, so it cannot be non-nullable.
        h.assume(nullable)
        value = st.none()
    elif pa.types.is_boolean(ty):
        value = st.booleans()
    elif pa.types.is_integer(ty):
        values = draw(npst.arrays(ty.to_pandas_dtype(), shape=(size,)))
        return pa.array(values, type=ty)
    elif pa.types.is_floating(ty):
        values = draw(npst.arrays(ty.to_pandas_dtype(), shape=(size,)))
        # Workaround ARROW-4952: no easy way to assert array equality
        # in a NaN-tolerant way.
        values[np.isnan(values)] = -42.0
        return pa.array(values, type=ty)
    elif pa.types.is_decimal(ty):
        # TODO(kszucs): properly limit the precision
        # value = st.decimals(places=type.scale, allow_infinity=False)
        h.reject()
    elif pa.types.is_time(ty):
        value = st.times()
    elif pa.types.is_date(ty):
        value = st.dates()
    elif pa.types.is_timestamp(ty):
        # Bound the datetimes by the int64 range interpreted as nanoseconds.
        min_int64 = -(2**63)
        max_int64 = 2**63 - 1
        min_datetime = datetime.datetime.fromtimestamp(min_int64 // 10**9)
        max_datetime = datetime.datetime.fromtimestamp(max_int64 // 10**9)
        if ty.tz is None:
            # Timezone-naive timestamp type: draw naive datetimes instead
            # of failing on int(None) below.
            timezones = st.none()
        else:
            try:
                # A timezone string that parses as an integer is treated
                # as a fixed offset in hours.
                offset_hours = int(ty.tz)
                tz = pytz.FixedOffset(offset_hours * 60)
            except ValueError:
                tz = pytz.timezone(ty.tz)
            timezones = st.just(tz)
        value = st.datetimes(timezones=timezones, min_value=min_datetime,
                             max_value=max_datetime)
    elif pa.types.is_duration(ty):
        value = st.timedeltas()
    elif pa.types.is_binary(ty) or pa.types.is_large_binary(ty):
        value = st.binary()
    elif pa.types.is_string(ty) or pa.types.is_large_string(ty):
        value = st.text()
    elif pa.types.is_fixed_size_binary(ty):
        value = st.binary(min_size=ty.byte_width, max_size=ty.byte_width)
    elif pa.types.is_list(ty) or pa.types.is_large_list(ty):
        # Each element is itself a python list of `size` child values.
        value = _pylist(ty.value_type, size=size, nullable=nullable)
    elif pa.types.is_fixed_size_list(ty):
        value = _pylist(ty.value_type, size=ty.list_size, nullable=nullable)
    elif pa.types.is_dictionary(ty):
        values = _pylist(ty.value_type, size=size, nullable=nullable)
        return pa.array(draw(values), type=ty)
    elif pa.types.is_map(ty):
        value = _pymap(ty.key_type, ty.item_type, size=_default_array_sizes,
                       nullable=nullable)
    elif pa.types.is_struct(ty):
        h.assume(len(ty) > 0)
        # Named to avoid shadowing the module-level `fields` strategy.
        struct_fields, child_arrays = [], []
        for field in ty:
            struct_fields.append(field)
            child_arrays.append(draw(arrays(field.type, size=size)))
        return pa.StructArray.from_arrays(child_arrays,
                                          fields=struct_fields)
    else:
        raise NotImplementedError(ty)

    if nullable:
        value = st.one_of(st.none(), value)
    values = st.lists(value, min_size=size, max_size=size)

    return pa.array(draw(values), type=ty)
| |
| |
@st.composite
def chunked_arrays(draw, type, min_chunks=0, max_chunks=None, chunk_size=None):
    """
    Draw a pyarrow ChunkedArray.

    `type` may be a DataType or a strategy producing one. The number of
    chunks is bounded by min_chunks and max_chunks, and chunk_size is
    passed to ``arrays`` as the size of every chunk. Examples with a
    struct type are discarded.
    """
    ty = draw(type) if isinstance(type, st.SearchStrategy) else type

    # TODO(kszucs): remove it, field metadata is not kept
    is_struct = pa.types.is_struct(ty)
    h.assume(not is_struct)

    chunk_lists = st.lists(
        arrays(ty, size=chunk_size),
        min_size=min_chunks,
        max_size=max_chunks,
    )
    return pa.chunked_array(draw(chunk_lists), type=ty)
| |
| |
@st.composite
def record_batches(draw, type, rows=None, max_fields=None):
    """
    Draw a pyarrow RecordBatch.

    Parameters
    ----------
    type : hypothesis strategy
        Strategy for the column types; passed on to ``schemas``.
    rows : int, hypothesis strategy or None
        Number of rows, or a strategy from which it is drawn. None draws
        the number from ``_default_array_sizes``.
    max_fields : int or None
        Passed on to ``schemas`` as the maximum number of columns.

    Raises
    ------
    TypeError
        If `rows` is not an int, a strategy or None.
    """
    if isinstance(rows, st.SearchStrategy):
        rows = draw(rows)
    elif rows is None:
        rows = draw(_default_array_sizes)
    elif not isinstance(rows, int):
        raise TypeError('Rows must be an integer')

    schema = draw(schemas(type, max_fields=max_fields))
    children = [draw(arrays(field.type, size=rows)) for field in schema]
    # Hand the Schema over through the `schema` keyword, the same way
    # `tables` does, instead of through `names`.
    return pa.RecordBatch.from_arrays(children, schema=schema)
| |
| |
@st.composite
def tables(draw, type, rows=None, max_fields=None):
    """
    Draw a pyarrow Table.

    A schema is drawn from ``schemas(type, max_fields=max_fields)`` and
    one array of `rows` elements is drawn per schema field. `rows` may be
    an int, a strategy, or None (drawn from ``_default_array_sizes``);
    anything else raises TypeError.
    """
    if rows is None:
        rows = draw(_default_array_sizes)
    elif isinstance(rows, st.SearchStrategy):
        rows = draw(rows)
    elif not isinstance(rows, int):
        raise TypeError('Rows must be an integer')

    table_schema = draw(schemas(type, max_fields=max_fields))
    columns = []
    for column_field in table_schema:
        columns.append(draw(arrays(column_field.type, size=rows)))
    return pa.Table.from_arrays(columns, schema=table_schema)
| |
| |
# Ready-made strategies covering every type produced by `all_types`.
all_arrays = arrays(type=all_types)
all_chunked_arrays = chunked_arrays(type=all_types)
all_record_batches = record_batches(type=all_types)
all_tables = tables(type=all_types)
| |
| |
# The strategies below mirror the ones above for use in the pandas tests;
# some types are left out of the generation because of known issues.

# Timestamp and duration types (timestamp_types, duration_types) need to
# be excluded, otherwise hypothesis discovers ARROW-10210.
pandas_compatible_primitive_types = (
    null_type |
    bool_type |
    integer_types |
    st.sampled_from([pa.float32(), pa.float64()]) |
    decimal128_type |
    date_types |
    time_types |
    interval_types |
    binary_type |
    string_type |
    large_binary_type |
    large_string_type
)

# Floating point types need to be excluded, otherwise hypothesis
# discovers ARROW-10211.
pandas_compatible_dictionary_value_types = (
    bool_type |
    integer_types |
    binary_type |
    string_type |
    fixed_size_binary_type
)
| |
| |
def pandas_compatible_list_types(
    item_strategy=pandas_compatible_primitive_types
):
    """
    Return a strategy for list and large_list types over item_strategy.

    The fixed size list type needs to be excluded, otherwise hypothesis
    discovers ARROW-10194.
    """
    variable_size = st.builds(pa.list_, item_strategy)
    large = st.builds(pa.large_list, item_strategy)
    return st.one_of(variable_size, large)
| |
| |
# The definition refers to itself (lists and structs of
# `pandas_compatible_types`), so it is wrapped in st.deferred.
pandas_compatible_types = st.deferred(
    lambda: (
        pandas_compatible_primitive_types |
        pandas_compatible_list_types(pandas_compatible_primitive_types) |
        struct_types(pandas_compatible_primitive_types) |
        dictionary_types(
            value_strategy=pandas_compatible_dictionary_value_types
        ) |
        pandas_compatible_list_types(pandas_compatible_types) |
        struct_types(pandas_compatible_types)
    )
)