| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| |
| from collections import namedtuple |
| import datetime |
| import decimal |
| from functools import lru_cache, partial |
| import inspect |
| import itertools |
| import math |
| import os |
| import pytest |
| import random |
| import sys |
| import textwrap |
| |
| try: |
| import numpy as np |
| except ImportError: |
| np = None |
| |
| try: |
| import pandas as pd |
| except ImportError: |
| pd = None |
| |
| import pyarrow as pa |
| import pyarrow.compute as pc |
| from pyarrow.lib import ArrowNotImplementedError, ArrowIndexError |
| |
| try: |
| import pyarrow.substrait as pas |
| except ImportError: |
| pas = None |
| |
| exported_functions = [ |
| func for (name, func) in sorted(pc.__dict__.items()) |
| if hasattr(func, '__arrow_compute_function__')] |
| |
| exported_option_classes = [ |
| cls for (name, cls) in sorted(pc.__dict__.items()) |
| if (isinstance(cls, type) and |
| cls is not pc.FunctionOptions and |
| issubclass(cls, pc.FunctionOptions))] |
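

# A minimal illustrative sketch (not a test): every exported compute wrapper
# carries an ``__arrow_compute_function__`` dict describing the underlying
# registered function. The keys read below ('name', 'arity',
# 'options_required') are the ones the tests in this module rely on.
def _example_function_descriptor(func=pc.add):
    desc = func.__arrow_compute_function__
    return desc['name'], desc['arity'], desc['options_required']
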
| |
| numerical_arrow_types = [ |
| pa.int8(), |
| pa.int16(), |
| pa.int64(), |
| pa.uint8(), |
| pa.uint16(), |
| pa.uint64(), |
| pa.float32(), |
| pa.float64() |
| ] |
| |
| |
| all_array_types = [ |
| ('bool', [True, False, False, True, True]), |
| ('uint8', range(5)), |
| ('int8', range(5)), |
| ('uint16', range(5)), |
| ('int16', range(5)), |
| ('uint32', range(5)), |
| ('int32', range(5)), |
| ('uint64', range(5, 10)), |
| ('int64', range(5, 10)), |
| ('float', [0, 0.1, 0.2, 0.3, 0.4]), |
| ('double', [0, 0.1, 0.2, 0.3, 0.4]), |
| ('string', ['a', 'b', None, 'ddd', 'ee']), |
| ('binary', [b'a', b'b', b'c', b'ddd', b'ee']), |
| (pa.binary(3), [b'abc', b'bcd', b'cde', b'def', b'efg']), |
| (pa.list_(pa.int8()), [[1, 2], [3, 4], [5, 6], None, [9, 16]]), |
| (pa.large_list(pa.int16()), [[1], [2, 3, 4], [5, 6], None, [9, 16]]), |
| (pa.struct([('a', pa.int8()), ('b', pa.int8())]), [ |
| {'a': 1, 'b': 2}, None, {'a': 3, 'b': 4}, None, {'a': 5, 'b': 6}]), |
| ] |
| |
| |
| def test_exported_functions(): |
| # Check that all exported concrete functions can be called with |
| # the right number of arguments. |
| # Note that unregistered functions (e.g. with a mismatching name) |
| # will raise KeyError. |
| functions = exported_functions |
| assert len(functions) >= 10 |
| for func in functions: |
| desc = func.__arrow_compute_function__ |
| if desc['options_required']: |
| # Skip this function as it will fail with a different error |
| # message if we don't pass an options instance. |
| continue |
| arity = desc['arity'] |
| if arity == 0: |
| continue |
| if arity is Ellipsis: |
| args = [object()] * 3 |
| else: |
| args = [object()] * arity |
| with pytest.raises(TypeError, |
| match="Got unexpected argument type " |
| "<class 'object'> for compute function"): |
| func(*args) |
| |
| |
| def test_hash_aggregate_not_exported(): |
| # Ensure we are not leaking hash aggregate functions |
| # which are not callable by themselves. |
| for func in exported_functions: |
| arrow_f = pc.get_function(func.__arrow_compute_function__["name"]) |
| assert arrow_f.kind != "hash_aggregate" |
| |
| |
| def test_exported_option_classes(): |
| classes = exported_option_classes |
| assert len(classes) >= 10 |
| for cls in classes: |
| # Option classes must have an introspectable constructor signature, |
| # and that signature should not have any *args or **kwargs. |
| sig = inspect.signature(cls) |
| for param in sig.parameters.values(): |
| assert param.kind not in (param.VAR_POSITIONAL, |
| param.VAR_KEYWORD) |
| |
| |
| @pytest.mark.filterwarnings( |
| "ignore:pyarrow.CumulativeSumOptions is deprecated as of 14.0" |
| ) |
| def test_option_class_equality(request): |
| options = [ |
| pc.ArraySortOptions(), |
| pc.AssumeTimezoneOptions("UTC"), |
| pc.CastOptions.safe(pa.int8()), |
| pc.CumulativeOptions(start=None, skip_nulls=False), |
| pc.CountOptions(), |
| pc.DayOfWeekOptions(count_from_zero=False, week_start=0), |
| pc.DictionaryEncodeOptions(), |
| pc.RunEndEncodeOptions(), |
| pc.ElementWiseAggregateOptions(skip_nulls=True), |
| pc.ExtractRegexOptions("pattern"), |
| pc.ExtractRegexSpanOptions("pattern"), |
| pc.FilterOptions(), |
| pc.IndexOptions(pa.scalar(1)), |
| pc.JoinOptions(), |
| pc.ListSliceOptions(0, -1, 1, True), |
| pc.ListFlattenOptions(recursive=False), |
| pc.MakeStructOptions(["field", "names"], |
| field_nullability=[True, True], |
| field_metadata=[pa.KeyValueMetadata({"a": "1"}), |
| pa.KeyValueMetadata({"b": "2"})]), |
| pc.MapLookupOptions(pa.scalar(1), "first"), |
| pc.MatchSubstringOptions("pattern"), |
| pc.ModeOptions(), |
| pc.NullOptions(), |
| pc.PadOptions(5), |
| pc.PairwiseOptions(period=1), |
| pc.PartitionNthOptions(1, null_placement="at_start"), |
| pc.PivotWiderOptions(["height"], unexpected_key_behavior="raise"), |
| pc.QuantileOptions(), |
| pc.RandomOptions(), |
| pc.RankOptions(sort_keys="ascending", |
| null_placement="at_start", tiebreaker="max"), |
| pc.RankQuantileOptions(sort_keys="ascending", |
| null_placement="at_start"), |
| pc.ReplaceSliceOptions(0, 1, "a"), |
| pc.ReplaceSubstringOptions("a", "b"), |
| pc.RoundOptions(2, "towards_infinity"), |
| pc.RoundBinaryOptions("towards_infinity"), |
| pc.RoundTemporalOptions(1, "second", week_starts_monday=True), |
| pc.RoundToMultipleOptions(100, "towards_infinity"), |
| pc.ScalarAggregateOptions(), |
| pc.SelectKOptions(0, sort_keys=[("b", "ascending")]), |
| pc.SetLookupOptions(pa.array([1])), |
| pc.SkewOptions(min_count=2), |
| pc.SliceOptions(0, 1, 1), |
| pc.SortOptions([("dummy", "descending")], null_placement="at_start"), |
| pc.SplitOptions(), |
| pc.SplitPatternOptions("pattern"), |
| pc.StrftimeOptions(), |
| pc.StrptimeOptions("%Y", "s", True), |
| pc.StructFieldOptions(indices=[]), |
| pc.TakeOptions(), |
| pc.TDigestOptions(), |
| pc.TrimOptions(" "), |
| pc.Utf8NormalizeOptions("NFKC"), |
| pc.VarianceOptions(), |
| pc.WinsorizeOptions(0.05, 0.9), |
| pc.WeekOptions(week_starts_monday=True, count_from_zero=False, |
| first_week_is_fully_in_year=False), |
| pc.ZeroFillOptions(4, "0"), |
| ] |
| # Timezone database might not be installed on Windows or Emscripten |
| if request.config.pyarrow.is_enabled["timezone_data"]: |
| options.append(pc.AssumeTimezoneOptions("Europe/Ljubljana")) |
| |
| classes = {type(option) for option in options} |
| |
| for cls in exported_option_classes: |
        # Timezone database might not be installed on Windows or Emscripten,
        # so only skip AssumeTimezoneOptions in that case; every other
        # untested options class should still be instantiated and checked.
        if (
            cls not in classes
            and (request.config.pyarrow.is_enabled["timezone_data"]
                 or cls != pc.AssumeTimezoneOptions)
        ):
| try: |
| options.append(cls()) |
| except TypeError: |
| pytest.fail(f"Options class is not tested: {cls}") |
| |
| for option in options: |
| assert option == option |
| assert repr(option).startswith(option.__class__.__name__) |
| buf = option.serialize() |
| deserialized = pc.FunctionOptions.deserialize(buf) |
| assert option == deserialized |
| # TODO remove the check under the if statement and the filterwarnings |
| # mark when the deprecated class CumulativeSumOptions is removed. |
| if repr(option).startswith("CumulativeSumOptions"): |
| assert repr(deserialized).startswith("CumulativeOptions") |
| else: |
| assert repr(option) == repr(deserialized) |
| for option1, option2 in zip(options, options[1:]): |
| assert option1 != option2 |
| |
| assert repr(pc.IndexOptions(pa.scalar(1))) == "IndexOptions(value=int64:1)" |
| assert repr(pc.ArraySortOptions()) == \ |
| "ArraySortOptions(order=Ascending, null_placement=AtEnd)" |
| |
| |
| def test_list_functions(): |
| assert len(pc.list_functions()) > 10 |
| assert "add" in pc.list_functions() |
| |
| |
| def _check_get_function(name, expected_func_cls, expected_ker_cls, |
| min_num_kernels=1): |
| func = pc.get_function(name) |
| assert isinstance(func, expected_func_cls) |
| n = func.num_kernels |
| assert n >= min_num_kernels |
| assert n == len(func.kernels) |
| assert all(isinstance(ker, expected_ker_cls) for ker in func.kernels) |
| |
| |
| def test_get_function_scalar(): |
| _check_get_function("add", pc.ScalarFunction, pc.ScalarKernel, 8) |
| |
| |
| def test_get_function_vector(): |
| _check_get_function("unique", pc.VectorFunction, pc.VectorKernel, 8) |
| |
| |
| def test_get_function_scalar_aggregate(): |
| _check_get_function("mean", pc.ScalarAggregateFunction, |
| pc.ScalarAggregateKernel, 8) |
| |
| |
| def test_get_function_hash_aggregate(): |
| _check_get_function("hash_sum", pc.HashAggregateFunction, |
| pc.HashAggregateKernel, 1) |
| |
| |
| @pytest.mark.numpy |
| def test_call_function_with_memory_pool(): |
| arr = pa.array(["foo", "bar", "baz"]) |
| indices = np.array([2, 2, 1]) |
| result1 = arr.take(indices) |
| result2 = pc.call_function('take', [arr, indices], |
| memory_pool=pa.default_memory_pool()) |
| expected = pa.array(["baz", "baz", "bar"]) |
| assert result1.equals(expected) |
| assert result2.equals(expected) |
| |
| result3 = pc.take(arr, indices, memory_pool=pa.default_memory_pool()) |
| assert result3.equals(expected) |
| |
| |
| def test_pickle_functions(pickle_module): |
| # Pickle registered functions |
| for name in pc.list_functions(): |
| func = pc.get_function(name) |
| reconstructed = pickle_module.loads(pickle_module.dumps(func)) |
| assert type(reconstructed) is type(func) |
| assert reconstructed.name == func.name |
| assert reconstructed.arity == func.arity |
| assert reconstructed.num_kernels == func.num_kernels |
| |
| |
| def test_pickle_global_functions(pickle_module): |
| # Pickle global wrappers (manual or automatic) of registered functions |
| for name in pc.list_functions(): |
| try: |
| func = getattr(pc, name) |
| except AttributeError: |
| # hash_aggregate functions are not exported as callables. |
| continue |
| reconstructed = pickle_module.loads(pickle_module.dumps(func)) |
| assert reconstructed is func |
| |
| |
| def test_function_attributes(): |
| # Sanity check attributes of registered functions |
| for name in pc.list_functions(): |
| func = pc.get_function(name) |
| assert isinstance(func, pc.Function) |
| assert func.name == name |
| kernels = func.kernels |
| assert func.num_kernels == len(kernels) |
| assert all(isinstance(ker, pc.Kernel) for ker in kernels) |
| repr(func) |
| for ker in kernels: |
| repr(ker) |
| |
| |
| def test_input_type_conversion(): |
| # Automatic array conversion from Python |
| arr = pc.add([1, 2], [4, None]) |
| assert arr.to_pylist() == [5, None] |
| # Automatic scalar conversion from Python |
| arr = pc.add([1, 2], 4) |
| assert arr.to_pylist() == [5, 6] |
| # Other scalar type |
| assert pc.equal(["foo", "bar", None], |
| "foo").to_pylist() == [True, False, None] |
| |
| |
| @pytest.mark.parametrize('arrow_type', numerical_arrow_types) |
| def test_sum_array(arrow_type): |
| arr = pa.array([1, 2, 3, 4], type=arrow_type) |
| assert arr.sum().as_py() == 10 |
| assert pc.sum(arr).as_py() == 10 |
| |
| arr = pa.array([1, 2, 3, 4, None], type=arrow_type) |
| assert arr.sum().as_py() == 10 |
| assert pc.sum(arr).as_py() == 10 |
| |
| arr = pa.array([None], type=arrow_type) |
    assert arr.sum().as_py() is None
    assert pc.sum(arr).as_py() is None
| assert arr.sum(min_count=0).as_py() == 0 |
| assert pc.sum(arr, min_count=0).as_py() == 0 |
| |
| arr = pa.array([], type=arrow_type) |
    assert arr.sum().as_py() is None
| assert arr.sum(min_count=0).as_py() == 0 |
| assert pc.sum(arr, min_count=0).as_py() == 0 |
| |
| |
| @pytest.mark.parametrize("arrow_type", [pa.decimal128(3, 2), pa.decimal256(3, 2)]) |
| def test_sum_decimal_array(arrow_type): |
| from decimal import Decimal |
| max_precision_type = ( |
| pa.decimal128(38, arrow_type.scale) |
| if pa.types.is_decimal128(arrow_type) |
| else pa.decimal256(76, arrow_type.scale) |
| ) |
| expected_sum = Decimal("5.79") |
| expected_sum_overflow = Decimal("10.00") |
| zero = Decimal("0.00") |
| |
| # No overflow |
| arr = pa.array([Decimal("1.23"), Decimal("4.56")], type=arrow_type) |
| assert arr.sum().as_py() == expected_sum |
| assert arr.sum().type == max_precision_type |
| |
| arr = pa.array([Decimal("1.23"), Decimal("4.56"), None], type=arrow_type) |
| assert arr.sum().as_py() == expected_sum |
| assert arr.sum().type == max_precision_type |
| |
| # With overflow |
| arr = pa.array([Decimal("1.23"), Decimal("8.77")], type=arrow_type) |
| assert arr.sum().as_py() == expected_sum_overflow |
| assert arr.sum().type == max_precision_type |
| |
| arr = pa.array([Decimal("1.23"), Decimal("8.77"), None], type=arrow_type) |
| assert arr.sum().as_py() == expected_sum_overflow |
| assert arr.sum().type == max_precision_type |
| |
| arr = pa.array([None], type=arrow_type) |
    assert arr.sum().as_py() is None
    assert arr.sum().type == max_precision_type
| assert arr.sum(min_count=0).as_py() == zero |
| assert arr.sum(min_count=0).type == max_precision_type |
| |
| arr = pa.array([], type=arrow_type) |
    assert arr.sum().as_py() is None
    assert arr.sum().type == max_precision_type
| assert arr.sum(min_count=0).as_py() == zero |
| assert arr.sum(min_count=0).type == max_precision_type |
| |
| |
| @pytest.mark.parametrize('arrow_type', numerical_arrow_types) |
| def test_sum_chunked_array(arrow_type): |
| arr = pa.chunked_array([pa.array([1, 2, 3, 4], type=arrow_type)]) |
| assert pc.sum(arr).as_py() == 10 |
| |
| arr = pa.chunked_array([ |
| pa.array([1, 2], type=arrow_type), pa.array([3, 4], type=arrow_type) |
| ]) |
| assert pc.sum(arr).as_py() == 10 |
| |
| arr = pa.chunked_array([ |
| pa.array([1, 2], type=arrow_type), |
| pa.array([], type=arrow_type), |
| pa.array([3, 4], type=arrow_type) |
| ]) |
| assert pc.sum(arr).as_py() == 10 |
| |
| arr = pa.chunked_array((), type=arrow_type) |
| assert arr.num_chunks == 0 |
    assert pc.sum(arr).as_py() is None
| assert pc.sum(arr, min_count=0).as_py() == 0 |
| |
| |
| @pytest.mark.parametrize('arrow_type', [pa.decimal128(3, 2), pa.decimal256(3, 2)]) |
| def test_sum_chunked_array_decimal_type(arrow_type): |
| from decimal import Decimal |
| max_precision_type = ( |
| pa.decimal128(38, arrow_type.scale) |
| if pa.types.is_decimal128(arrow_type) |
| else pa.decimal256(76, arrow_type.scale) |
| ) |
| expected_sum = Decimal("5.79") |
| zero = Decimal("0.00") |
| |
| arr = pa.chunked_array( |
| [ |
| pa.array([Decimal("1.23"), Decimal("4.56")], type=arrow_type) |
| ] |
| ) |
| assert pc.sum(arr).as_py() == expected_sum |
| assert pc.sum(arr).type == max_precision_type |
| |
| arr = pa.chunked_array([ |
| pa.array([Decimal("1.23")], type=arrow_type), |
| pa.array([Decimal("4.56")], type=arrow_type) |
| ]) |
| assert pc.sum(arr).as_py() == expected_sum |
| assert pc.sum(arr).type == max_precision_type |
| |
| arr = pa.chunked_array([ |
| pa.array([Decimal("1.23")], type=arrow_type), |
| pa.array([], type=arrow_type), |
| pa.array([Decimal("4.56")], type=arrow_type) |
| ]) |
| assert pc.sum(arr).as_py() == expected_sum |
| assert pc.sum(arr).type == max_precision_type |
| |
| arr = pa.chunked_array((), type=arrow_type) |
| assert arr.num_chunks == 0 |
    assert pc.sum(arr).as_py() is None
| assert pc.sum(arr).type == max_precision_type |
| assert pc.sum(arr, min_count=0).as_py() == zero |
| assert pc.sum(arr, min_count=0).type == max_precision_type |
| |
| |
| def test_mode_array(): |
| # ARROW-9917 |
| arr = pa.array([1, 1, 3, 4, 3, 5], type='int64') |
| mode = pc.mode(arr) |
| assert len(mode) == 1 |
| assert mode[0].as_py() == {"mode": 1, "count": 2} |
| |
| mode = pc.mode(arr, n=2) |
| assert len(mode) == 2 |
| assert mode[0].as_py() == {"mode": 1, "count": 2} |
| assert mode[1].as_py() == {"mode": 3, "count": 2} |
| |
| arr = pa.array([], type='int64') |
| assert len(pc.mode(arr)) == 0 |
| |
| arr = pa.array([1, 1, 3, 4, 3, None], type='int64') |
| mode = pc.mode(arr, skip_nulls=False) |
| assert len(mode) == 0 |
| mode = pc.mode(arr, min_count=6) |
| assert len(mode) == 0 |
| mode = pc.mode(arr, skip_nulls=False, min_count=5) |
| assert len(mode) == 0 |
| |
| arr = pa.array([True, False]) |
| mode = pc.mode(arr, n=2) |
| assert len(mode) == 2 |
| assert mode[0].as_py() == {"mode": False, "count": 1} |
| assert mode[1].as_py() == {"mode": True, "count": 1} |
| |
| |
| def test_mode_chunked_array(): |
| # ARROW-9917 |
| arr = pa.chunked_array([pa.array([1, 1, 3, 4, 3, 5], type='int64')]) |
| mode = pc.mode(arr) |
| assert len(mode) == 1 |
| assert mode[0].as_py() == {"mode": 1, "count": 2} |
| |
| mode = pc.mode(arr, n=2) |
| assert len(mode) == 2 |
| assert mode[0].as_py() == {"mode": 1, "count": 2} |
| assert mode[1].as_py() == {"mode": 3, "count": 2} |
| |
| arr = pa.chunked_array((), type='int64') |
| assert arr.num_chunks == 0 |
| assert len(pc.mode(arr)) == 0 |
| |
| |
| def test_empty_chunked_array(): |
| msg = "cannot construct ChunkedArray from empty vector and omitted type" |
| with pytest.raises(pa.ArrowInvalid, match=msg): |
| pa.chunked_array([]) |
| |
| pa.chunked_array([], type=pa.int8()) |
| |
| |
| def test_variance(): |
| data = [1, 2, 3, 4, 5, 6, 7, 8] |
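    # The sum of squared deviations from the mean (4.5) is 42.0, so the
    # population variance (the default, ddof=0) is 42/8 = 5.25 and the
    # sample variance (ddof=1) is 42/7 = 6.0.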
| assert pc.variance(data).as_py() == 5.25 |
| assert pc.variance(data, ddof=0).as_py() == 5.25 |
| assert pc.variance(data, ddof=1).as_py() == 6.0 |
| |
| |
| def test_skew(): |
| data = [1, 1, None, 2] |
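    # With the null skipped, the data is [1, 1, 2], whose biased sample
    # skewness is 1/sqrt(2) ~= 0.7071.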
| assert pc.skew(data).as_py() == pytest.approx(0.707106781186548, rel=1e-10) |
| assert pc.skew(data, skip_nulls=False).as_py() is None |
| assert pc.skew(data, min_count=4).as_py() is None |
| |
| |
| def test_kurtosis(): |
| data = [1, 1, None, 2] |
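    # With the null skipped, the data is [1, 1, 2], whose biased excess
    # kurtosis is m4/m2**2 - 3 = 1.5 - 3 = -1.5.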
| assert pc.kurtosis(data).as_py() == pytest.approx(-1.5, rel=1e-10) |
| assert pc.kurtosis(data, skip_nulls=False).as_py() is None |
| assert pc.kurtosis(data, min_count=4).as_py() is None |
| |
| |
| @pytest.mark.parametrize("input, expected", ( |
| ( |
| [1.0, 2.0, 3.0, 40.0, None], |
| {'skew': pytest.approx(1.988947740397821), |
| 'kurtosis': pytest.approx(3.9631931024230695)} |
| ), |
| ([1, 2, 40], {'skew': pytest.approx(1.7281098503730385), 'kurtosis': None}), |
| ([1, 40], {'skew': None, 'kurtosis': None}), |
| )) |
| def test_unbiased_skew_and_kurtosis(input, expected): |
| arrow_skew = pc.skew(input, skip_nulls=True, biased=False) |
| arrow_kurtosis = pc.kurtosis(input, skip_nulls=True, biased=False) |
| assert arrow_skew.as_py() == expected['skew'] |
| assert arrow_kurtosis.as_py() == expected['kurtosis'] |
| |
| |
| def test_count_substring(): |
| for (ty, offset) in [(pa.string(), pa.int32()), |
| (pa.large_string(), pa.int64())]: |
| arr = pa.array(["ab", "cab", "abcab", "ba", "AB", None], type=ty) |
| |
| result = pc.count_substring(arr, "ab") |
| expected = pa.array([1, 1, 2, 0, 0, None], type=offset) |
| assert expected == result |
| |
| result = pc.count_substring(arr, "ab", ignore_case=True) |
| expected = pa.array([1, 1, 2, 0, 1, None], type=offset) |
| assert expected == result |
| |
| |
| def test_count_substring_regex(): |
| for (ty, offset) in [(pa.string(), pa.int32()), |
| (pa.large_string(), pa.int64())]: |
| arr = pa.array(["ab", "cab", "baAacaa", "ba", "AB", None], type=ty) |
| |
| result = pc.count_substring_regex(arr, "a+") |
| expected = pa.array([1, 1, 3, 1, 0, None], type=offset) |
| assert expected.equals(result) |
| |
| result = pc.count_substring_regex(arr, "a+", ignore_case=True) |
| expected = pa.array([1, 1, 2, 1, 1, None], type=offset) |
| assert expected.equals(result) |
| |
| |
| def test_find_substring(): |
| for ty in [pa.string(), pa.binary(), pa.large_string(), pa.large_binary()]: |
| arr = pa.array(["ab", "cab", "ba", None], type=ty) |
| result = pc.find_substring(arr, "ab") |
| assert result.to_pylist() == [0, 1, -1, None] |
| |
| result = pc.find_substring_regex(arr, "a?b") |
| assert result.to_pylist() == [0, 1, 0, None] |
| |
| arr = pa.array(["ab*", "cAB*", "ba", "aB?"], type=ty) |
| result = pc.find_substring(arr, "aB*", ignore_case=True) |
| assert result.to_pylist() == [0, 1, -1, -1] |
| |
| result = pc.find_substring_regex(arr, "a?b", ignore_case=True) |
| assert result.to_pylist() == [0, 1, 0, 0] |
| |
| |
| def test_match_like(): |
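    # match_like uses SQL LIKE semantics: '_' matches exactly one character,
    # '%' matches any run of characters, and a backslash escapes a literal
    # '%' or '_'.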
| arr = pa.array(["ab", "ba%", "ba", "ca%d", None]) |
| result = pc.match_like(arr, r"_a\%%") |
| expected = pa.array([False, True, False, True, None]) |
| assert expected.equals(result) |
| |
| arr = pa.array(["aB", "bA%", "ba", "ca%d", None]) |
| result = pc.match_like(arr, r"_a\%%", ignore_case=True) |
| expected = pa.array([False, True, False, True, None]) |
| assert expected.equals(result) |
| result = pc.match_like(arr, r"_a\%%", ignore_case=False) |
| expected = pa.array([False, False, False, True, None]) |
| assert expected.equals(result) |
| |
| |
| def test_match_substring(): |
| arr = pa.array(["ab", "abc", "ba", None]) |
| result = pc.match_substring(arr, "ab") |
| expected = pa.array([True, True, False, None]) |
| assert expected.equals(result) |
| |
| arr = pa.array(["áB", "Ábc", "ba", None]) |
| result = pc.match_substring(arr, "áb", ignore_case=True) |
| expected = pa.array([True, True, False, None]) |
| assert expected.equals(result) |
| result = pc.match_substring(arr, "áb", ignore_case=False) |
| expected = pa.array([False, False, False, None]) |
| assert expected.equals(result) |
| |
| |
| def test_match_substring_regex(): |
| arr = pa.array(["ab", "abc", "ba", "c", None]) |
| result = pc.match_substring_regex(arr, "^a?b") |
| expected = pa.array([True, True, True, False, None]) |
| assert expected.equals(result) |
| |
| arr = pa.array(["aB", "Abc", "BA", "c", None]) |
| result = pc.match_substring_regex(arr, "^a?b", ignore_case=True) |
| expected = pa.array([True, True, True, False, None]) |
| assert expected.equals(result) |
| result = pc.match_substring_regex(arr, "^a?b", ignore_case=False) |
| expected = pa.array([False, False, False, False, None]) |
| assert expected.equals(result) |
| |
| |
| def test_trim(): |
    # \u3000 (ideographic space) is Unicode whitespace but not ASCII
    # whitespace
| arr = pa.array([" foo", None, " \u3000foo bar \t"]) |
| result = pc.utf8_trim_whitespace(arr) |
| expected = pa.array(["foo", None, "foo bar"]) |
| assert expected.equals(result) |
| |
| arr = pa.array([" foo", None, " \u3000foo bar \t"]) |
| result = pc.ascii_trim_whitespace(arr) |
| expected = pa.array(["foo", None, "\u3000foo bar"]) |
| assert expected.equals(result) |
| |
| arr = pa.array([" foo", None, " \u3000foo bar \t"]) |
| result = pc.utf8_trim(arr, characters=' f\u3000') |
| expected = pa.array(["oo", None, "oo bar \t"]) |
| assert expected.equals(result) |
| # Positional option |
| result = pc.utf8_trim(arr, ' f\u3000') |
| expected = pa.array(["oo", None, "oo bar \t"]) |
| assert expected.equals(result) |
| |
| |
| def test_slice_compatibility(): |
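    # Exhaustively compare utf8_slice_codeunits against Python's own str
    # slicing; every character of the input below is multi-byte in UTF-8,
    # so any byte-based slicing would diverge from the expected result.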
| arr = pa.array(["", "𝑓", "𝑓ö", "𝑓öõ", "𝑓öõḍ", "𝑓öõḍš"]) |
| for start in range(-6, 6): |
| for stop in itertools.chain(range(-6, 6), [None]): |
| for step in [-3, -2, -1, 1, 2, 3]: |
| expected = pa.array([k.as_py()[start:stop:step] |
| for k in arr]) |
| result = pc.utf8_slice_codeunits( |
| arr, start=start, stop=stop, step=step) |
| assert expected.equals(result) |
| # Positional options |
| assert pc.utf8_slice_codeunits(arr, |
| start, stop, step) == result |
| |
| |
| def test_binary_slice_compatibility(): |
| data = [b"", b"a", b"a\xff", b"ab\x00", b"abc\xfb", b"ab\xf2de"] |
| arr = pa.array(data) |
| for start, stop, step in itertools.product(range(-6, 6), |
| range(-6, 6), |
| range(-3, 4)): |
| if step == 0: |
| continue |
| expected = pa.array([k.as_py()[start:stop:step] |
| for k in arr]) |
| result = pc.binary_slice( |
| arr, start=start, stop=stop, step=step) |
| assert expected.equals(result) |
| # Positional options |
| assert pc.binary_slice(arr, start, stop, step) == result |
| # Fixed size binary input / output |
| for item in data: |
| fsb_scalar = pa.scalar(item, type=pa.binary(len(item))) |
| expected = item[start:stop:step] |
| actual = pc.binary_slice(fsb_scalar, start, stop, step) |
| assert actual.type == pa.binary(len(expected)) |
| assert actual.as_py() == expected |
| |
| |
| def test_split_pattern(): |
| arr = pa.array(["-foo---bar--", "---foo---b"]) |
| result = pc.split_pattern(arr, pattern="---") |
| expected = pa.array([["-foo", "bar--"], ["", "foo", "b"]]) |
| assert expected.equals(result) |
| |
| result = pc.split_pattern(arr, "---", max_splits=1) |
| expected = pa.array([["-foo", "bar--"], ["", "foo---b"]]) |
| assert expected.equals(result) |
| |
| result = pc.split_pattern(arr, "---", max_splits=1, reverse=True) |
| expected = pa.array([["-foo", "bar--"], ["---foo", "b"]]) |
| assert expected.equals(result) |
| |
| |
| def test_split_whitespace_utf8(): |
| arr = pa.array(["foo bar", " foo \u3000\tb"]) |
| result = pc.utf8_split_whitespace(arr) |
| expected = pa.array([["foo", "bar"], ["", "foo", "b"]]) |
| assert expected.equals(result) |
| |
| result = pc.utf8_split_whitespace(arr, max_splits=1) |
| expected = pa.array([["foo", "bar"], ["", "foo \u3000\tb"]]) |
| assert expected.equals(result) |
| |
| result = pc.utf8_split_whitespace(arr, max_splits=1, reverse=True) |
| expected = pa.array([["foo", "bar"], [" foo", "b"]]) |
| assert expected.equals(result) |
| |
| |
| def test_split_whitespace_ascii(): |
| arr = pa.array(["foo bar", " foo \u3000\tb"]) |
| result = pc.ascii_split_whitespace(arr) |
| expected = pa.array([["foo", "bar"], ["", "foo", "\u3000", "b"]]) |
| assert expected.equals(result) |
| |
| result = pc.ascii_split_whitespace(arr, max_splits=1) |
| expected = pa.array([["foo", "bar"], ["", "foo \u3000\tb"]]) |
| assert expected.equals(result) |
| |
| result = pc.ascii_split_whitespace(arr, max_splits=1, reverse=True) |
| expected = pa.array([["foo", "bar"], [" foo \u3000", "b"]]) |
| assert expected.equals(result) |
| |
| |
| def test_split_pattern_regex(): |
| arr = pa.array(["-foo---bar--", "---foo---b"]) |
| result = pc.split_pattern_regex(arr, pattern="-+") |
| expected = pa.array([["", "foo", "bar", ""], ["", "foo", "b"]]) |
| assert expected.equals(result) |
| |
| result = pc.split_pattern_regex(arr, "-+", max_splits=1) |
| expected = pa.array([["", "foo---bar--"], ["", "foo---b"]]) |
| assert expected.equals(result) |
| |
| with pytest.raises(NotImplementedError, |
| match="Cannot split in reverse with regex"): |
| result = pc.split_pattern_regex( |
| arr, pattern="---", max_splits=1, reverse=True) |
| |
| |
| def test_min_max(): |
| # An example generated function wrapper with possible options |
| data = [4, 5, 6, None, 1] |
| s = pc.min_max(data) |
| assert s.as_py() == {'min': 1, 'max': 6} |
| s = pc.min_max(data, options=pc.ScalarAggregateOptions()) |
| assert s.as_py() == {'min': 1, 'max': 6} |
| s = pc.min_max(data, options=pc.ScalarAggregateOptions(skip_nulls=True)) |
| assert s.as_py() == {'min': 1, 'max': 6} |
| s = pc.min_max(data, options=pc.ScalarAggregateOptions(skip_nulls=False)) |
| assert s.as_py() == {'min': None, 'max': None} |
| |
| # Options as dict of kwargs |
| s = pc.min_max(data, options={'skip_nulls': False}) |
| assert s.as_py() == {'min': None, 'max': None} |
| # Options as named functions arguments |
| s = pc.min_max(data, skip_nulls=False) |
| assert s.as_py() == {'min': None, 'max': None} |
| |
| # Both options and named arguments |
| with pytest.raises(TypeError): |
| s = pc.min_max( |
| data, options=pc.ScalarAggregateOptions(), skip_nulls=False) |
| |
| # Wrong options type |
| options = pc.TakeOptions() |
| with pytest.raises(TypeError): |
| s = pc.min_max(data, options=options) |
| |
| # Missing argument |
| with pytest.raises(TypeError, match="min_max takes 1 positional"): |
| s = pc.min_max() |
| |
| |
| def test_any(): |
| # ARROW-1846 |
| |
| options = pc.ScalarAggregateOptions(skip_nulls=False, min_count=0) |
| |
| a = pa.array([], type='bool') |
| assert pc.any(a).as_py() is None |
| assert pc.any(a, min_count=0).as_py() is False |
| assert pc.any(a, options=options).as_py() is False |
| |
| a = pa.array([False, None, True]) |
| assert pc.any(a).as_py() is True |
| assert pc.any(a, options=options).as_py() is True |
| |
| a = pa.array([False, None, False]) |
| assert pc.any(a).as_py() is False |
| assert pc.any(a, options=options).as_py() is None |
| |
| |
| def test_all(): |
| # ARROW-10301 |
| |
| options = pc.ScalarAggregateOptions(skip_nulls=False, min_count=0) |
| |
| a = pa.array([], type='bool') |
| assert pc.all(a).as_py() is None |
| assert pc.all(a, min_count=0).as_py() is True |
| assert pc.all(a, options=options).as_py() is True |
| |
| a = pa.array([False, True]) |
| assert pc.all(a).as_py() is False |
| assert pc.all(a, options=options).as_py() is False |
| |
| a = pa.array([True, None]) |
| assert pc.all(a).as_py() is True |
| assert pc.all(a, options=options).as_py() is None |
| |
| a = pa.chunked_array([[True], [True, None]]) |
| assert pc.all(a).as_py() is True |
| assert pc.all(a, options=options).as_py() is None |
| |
| a = pa.chunked_array([[True], [False]]) |
| assert pc.all(a).as_py() is False |
| assert pc.all(a, options=options).as_py() is False |
| |
| |
| def test_is_valid(): |
| # An example generated function wrapper without options |
| data = [4, 5, None] |
| assert pc.is_valid(data).to_pylist() == [True, True, False] |
| |
| with pytest.raises(TypeError): |
| pc.is_valid(data, options=None) |
| |
| |
| def test_generated_docstrings(): |
| # With options |
| assert pc.min_max.__doc__ == textwrap.dedent("""\ |
| Compute the minimum and maximum values of a numeric array. |
| |
| Null values are ignored by default. |
| This can be changed through ScalarAggregateOptions. |
| |
| Parameters |
| ---------- |
| array : Array-like |
| Argument to compute function. |
| skip_nulls : bool, default True |
| Whether to skip (ignore) nulls in the input. |
| If False, any null in the input forces the output to null. |
| min_count : int, default 1 |
| Minimum number of non-null values in the input. If the number |
| of non-null values is below `min_count`, the output is null. |
| options : pyarrow.compute.ScalarAggregateOptions, optional |
| Alternative way of passing options. |
| memory_pool : pyarrow.MemoryPool, optional |
| If not passed, will allocate memory from the default memory pool. |
| |
| Examples |
| -------- |
| >>> import pyarrow as pa |
| >>> import pyarrow.compute as pc |
| >>> arr1 = pa.array([1, 1, 2, 2, 3, 2, 2, 2]) |
| >>> pc.min_max(arr1) |
| <pyarrow.StructScalar: [('min', 1), ('max', 3)]> |
| |
| Using ``skip_nulls`` to handle null values. |
| |
| >>> arr2 = pa.array([1.0, None, 2.0, 3.0]) |
| >>> pc.min_max(arr2) |
| <pyarrow.StructScalar: [('min', 1.0), ('max', 3.0)]> |
| >>> pc.min_max(arr2, skip_nulls=False) |
| <pyarrow.StructScalar: [('min', None), ('max', None)]> |
| |
| Using ``ScalarAggregateOptions`` to control minimum number of non-null values. |
| |
| >>> arr3 = pa.array([1.0, None, float("nan"), 3.0]) |
| >>> pc.min_max(arr3) |
| <pyarrow.StructScalar: [('min', 1.0), ('max', 3.0)]> |
| >>> pc.min_max(arr3, options=pc.ScalarAggregateOptions(min_count=3)) |
| <pyarrow.StructScalar: [('min', 1.0), ('max', 3.0)]> |
| >>> pc.min_max(arr3, options=pc.ScalarAggregateOptions(min_count=4)) |
| <pyarrow.StructScalar: [('min', None), ('max', None)]> |
| |
| This function also works with string values. |
| |
| >>> arr4 = pa.array(["z", None, "y", "x"]) |
| >>> pc.min_max(arr4) |
| <pyarrow.StructScalar: [('min', 'x'), ('max', 'z')]> |
| """) |
| # Without options |
| assert pc.add.__doc__ == textwrap.dedent("""\ |
| Add the arguments element-wise. |
| |
| Results will wrap around on integer overflow. |
| Use function "add_checked" if you want overflow |
| to return an error. |
| |
| Parameters |
| ---------- |
| x : Array-like or scalar-like |
| Argument to compute function. |
| y : Array-like or scalar-like |
| Argument to compute function. |
| memory_pool : pyarrow.MemoryPool, optional |
| If not passed, will allocate memory from the default memory pool. |
| """) |
| # Varargs with options |
| assert pc.min_element_wise.__doc__ == textwrap.dedent("""\ |
| Find the element-wise minimum value. |
| |
| Nulls are ignored (by default) or propagated. |
| NaN is preferred over null, but not over any valid value. |
| |
| Parameters |
| ---------- |
| *args : Array-like or scalar-like |
| Argument to compute function. |
| skip_nulls : bool, default True |
| Whether to skip (ignore) nulls in the input. |
| If False, any null in the input forces the output to null. |
| options : pyarrow.compute.ElementWiseAggregateOptions, optional |
| Alternative way of passing options. |
| memory_pool : pyarrow.MemoryPool, optional |
| If not passed, will allocate memory from the default memory pool. |
| """) |
| assert pc.filter.__doc__ == textwrap.dedent("""\ |
| Filter with a boolean selection filter. |
| |
| The output is populated with values from the input at positions |
| where the selection filter is non-zero. Nulls in the selection filter |
| are handled based on FilterOptions. |
| |
| Parameters |
| ---------- |
| input : Array-like or scalar-like |
| Argument to compute function. |
| selection_filter : Array-like or scalar-like |
| Argument to compute function. |
| null_selection_behavior : str, default "drop" |
| How to handle nulls in the selection filter. |
| Accepted values are "drop", "emit_null". |
| options : pyarrow.compute.FilterOptions, optional |
| Alternative way of passing options. |
| memory_pool : pyarrow.MemoryPool, optional |
| If not passed, will allocate memory from the default memory pool. |
| |
| Examples |
| -------- |
| >>> import pyarrow as pa |
| >>> arr = pa.array(["a", "b", "c", None, "e"]) |
| >>> mask = pa.array([True, False, None, False, True]) |
| >>> arr.filter(mask) |
| <pyarrow.lib.StringArray object at ...> |
| [ |
| "a", |
| "e" |
| ] |
| >>> arr.filter(mask, null_selection_behavior='emit_null') |
| <pyarrow.lib.StringArray object at ...> |
| [ |
| "a", |
| null, |
| "e" |
| ] |
| """) |
| |
| |
| def test_generated_signatures(): |
| # The self-documentation provided by signatures should show acceptable |
| # options and their default values. |
| |
| # Without options |
| sig = inspect.signature(pc.add) |
| assert str(sig) == "(x, y, /, *, memory_pool=None)" |
| # With options |
| sig = inspect.signature(pc.min_max) |
| assert str(sig) == ("(array, /, *, skip_nulls=True, min_count=1, " |
| "options=None, memory_pool=None)") |
| # With positional options |
| sig = inspect.signature(pc.quantile) |
| assert str(sig) == ("(array, /, q=0.5, *, interpolation='linear', " |
| "skip_nulls=True, min_count=0, " |
| "options=None, memory_pool=None)") |
| # Varargs with options |
| sig = inspect.signature(pc.binary_join_element_wise) |
| assert str(sig) == ("(*strings, null_handling='emit_null', " |
| "null_replacement='', options=None, " |
| "memory_pool=None)") |
| # Varargs without options |
| sig = inspect.signature(pc.choose) |
| assert str(sig) == "(indices, /, *values, memory_pool=None)" |
| # Nullary with options |
| sig = inspect.signature(pc.random) |
| assert str(sig) == ("(n, *, initializer='system', " |
| "options=None, memory_pool=None)") |
| |
| |
# We use isprintable to find out about codepoints that Python doesn't know,
# but utf8proc does (or, in a future version of Python, the other way
# around). These codepoints cannot be compared between Arrow and the Python
# implementation.
| @lru_cache() |
| def find_new_unicode_codepoints(): |
| new = set() |
| characters = [chr(c) for c in range(0x80, 0x11000) |
| if not (0xD800 <= c < 0xE000)] |
| is_printable = pc.utf8_is_printable(pa.array(characters)).to_pylist() |
| for i, c in enumerate(characters): |
| if is_printable[i] != c.isprintable(): |
| new.add(ord(c)) |
| return new |
| |
| |
# Python claims these are not alpha; unclear why, since they are in
# gc='Other Letter': https://graphemica.com/%E1%B3%B2
| unknown_issue_is_alpha = {0x1cf2, 0x1cf3} |
| # utf8proc does not know if codepoints are lower case |
| utf8proc_issue_is_lower = { |
| 0xaa, 0xba, 0x2b0, 0x2b1, 0x2b2, 0x2b3, 0x2b4, |
| 0x2b5, 0x2b6, 0x2b7, 0x2b8, 0x2c0, 0x2c1, 0x2e0, |
| 0x2e1, 0x2e2, 0x2e3, 0x2e4, 0x37a, 0x1d2c, 0x1d2d, |
| 0x1d2e, 0x1d2f, 0x1d30, 0x1d31, 0x1d32, 0x1d33, |
| 0x1d34, 0x1d35, 0x1d36, 0x1d37, 0x1d38, 0x1d39, |
| 0x1d3a, 0x1d3b, 0x1d3c, 0x1d3d, 0x1d3e, 0x1d3f, |
| 0x1d40, 0x1d41, 0x1d42, 0x1d43, 0x1d44, 0x1d45, |
| 0x1d46, 0x1d47, 0x1d48, 0x1d49, 0x1d4a, 0x1d4b, |
| 0x1d4c, 0x1d4d, 0x1d4e, 0x1d4f, 0x1d50, 0x1d51, |
| 0x1d52, 0x1d53, 0x1d54, 0x1d55, 0x1d56, 0x1d57, |
| 0x1d58, 0x1d59, 0x1d5a, 0x1d5b, 0x1d5c, 0x1d5d, |
| 0x1d5e, 0x1d5f, 0x1d60, 0x1d61, 0x1d62, 0x1d63, |
| 0x1d64, 0x1d65, 0x1d66, 0x1d67, 0x1d68, 0x1d69, |
| 0x1d6a, 0x1d78, 0x1d9b, 0x1d9c, 0x1d9d, 0x1d9e, |
| 0x1d9f, 0x1da0, 0x1da1, 0x1da2, 0x1da3, 0x1da4, |
| 0x1da5, 0x1da6, 0x1da7, 0x1da8, 0x1da9, 0x1daa, |
| 0x1dab, 0x1dac, 0x1dad, 0x1dae, 0x1daf, 0x1db0, |
| 0x1db1, 0x1db2, 0x1db3, 0x1db4, 0x1db5, 0x1db6, |
| 0x1db7, 0x1db8, 0x1db9, 0x1dba, 0x1dbb, 0x1dbc, |
| 0x1dbd, 0x1dbe, 0x1dbf, 0x2071, 0x207f, 0x2090, |
| 0x2091, 0x2092, 0x2093, 0x2094, 0x2095, 0x2096, |
| 0x2097, 0x2098, 0x2099, 0x209a, 0x209b, 0x209c, |
| 0x2c7c, 0x2c7d, 0xa69c, 0xa69d, 0xa770, 0xa7f8, |
| 0xa7f9, 0xab5c, 0xab5d, 0xab5e, 0xab5f, } |
| # utf8proc does not store if a codepoint is numeric |
| numeric_info_missing = { |
| 0x3405, 0x3483, 0x382a, 0x3b4d, 0x4e00, 0x4e03, |
| 0x4e07, 0x4e09, 0x4e5d, 0x4e8c, 0x4e94, 0x4e96, |
| 0x4ebf, 0x4ec0, 0x4edf, 0x4ee8, 0x4f0d, 0x4f70, |
| 0x5104, 0x5146, 0x5169, 0x516b, 0x516d, 0x5341, |
| 0x5343, 0x5344, 0x5345, 0x534c, 0x53c1, 0x53c2, |
| 0x53c3, 0x53c4, 0x56db, 0x58f1, 0x58f9, 0x5e7a, |
| 0x5efe, 0x5eff, 0x5f0c, 0x5f0d, 0x5f0e, 0x5f10, |
| 0x62fe, 0x634c, 0x67d2, 0x6f06, 0x7396, 0x767e, |
| 0x8086, 0x842c, 0x8cae, 0x8cb3, 0x8d30, 0x9621, |
| 0x9646, 0x964c, 0x9678, 0x96f6, 0xf96b, 0xf973, |
| 0xf978, 0xf9b2, 0xf9d1, 0xf9d3, 0xf9fd, 0x10fc5, |
| 0x10fc6, 0x10fc7, 0x10fc8, 0x10fc9, 0x10fca, |
| 0x10fcb, } |
# utf8proc has no digit/numeric information
| digit_info_missing = { |
| 0xb2, 0xb3, 0xb9, 0x1369, 0x136a, 0x136b, 0x136c, |
| 0x136d, 0x136e, 0x136f, 0x1370, 0x1371, 0x19da, 0x2070, |
| 0x2074, 0x2075, 0x2076, 0x2077, 0x2078, 0x2079, 0x2080, |
| 0x2081, 0x2082, 0x2083, 0x2084, 0x2085, 0x2086, 0x2087, |
| 0x2088, 0x2089, 0x2460, 0x2461, 0x2462, 0x2463, 0x2464, |
| 0x2465, 0x2466, 0x2467, 0x2468, 0x2474, 0x2475, 0x2476, |
| 0x2477, 0x2478, 0x2479, 0x247a, 0x247b, 0x247c, 0x2488, |
| 0x2489, 0x248a, 0x248b, 0x248c, 0x248d, 0x248e, 0x248f, |
| 0x2490, 0x24ea, 0x24f5, 0x24f6, 0x24f7, 0x24f8, 0x24f9, |
| 0x24fa, 0x24fb, 0x24fc, 0x24fd, 0x24ff, 0x2776, 0x2777, |
| 0x2778, 0x2779, 0x277a, 0x277b, 0x277c, 0x277d, 0x277e, |
| 0x2780, 0x2781, 0x2782, 0x2783, 0x2784, 0x2785, 0x2786, |
| 0x2787, 0x2788, 0x278a, 0x278b, 0x278c, 0x278d, 0x278e, |
| 0x278f, 0x2790, 0x2791, 0x2792, 0x10a40, 0x10a41, |
| 0x10a42, 0x10a43, 0x10e60, 0x10e61, 0x10e62, 0x10e63, |
| 0x10e64, 0x10e65, 0x10e66, 0x10e67, 0x10e68, } |
| |
| codepoints_ignore = { |
| 'is_alnum': numeric_info_missing | digit_info_missing | |
| unknown_issue_is_alpha, |
| 'is_alpha': unknown_issue_is_alpha, |
| 'is_digit': digit_info_missing, |
| 'is_numeric': numeric_info_missing, |
| 'is_lower': utf8proc_issue_is_lower |
| } |
| |
| |
| @pytest.mark.parametrize('function_name', ['is_alnum', 'is_alpha', |
| 'is_ascii', 'is_decimal', |
| 'is_digit', 'is_lower', |
| 'is_numeric', 'is_printable', |
| 'is_space', 'is_upper', ]) |
| @pytest.mark.parametrize('variant', ['ascii', 'utf8']) |
| def test_string_py_compat_boolean(function_name, variant): |
| arrow_name = variant + "_" + function_name |
| py_name = function_name.replace('_', '') |
| ignore = codepoints_ignore.get(function_name, set()) | \ |
| find_new_unicode_codepoints() |
    for i in range(128 if variant == 'ascii' else 0x11000):
        if i in range(0xD800, 0xE000):
            continue  # surrogates are not valid Unicode scalar values
| # the issues we know of, we skip |
| if i in ignore: |
| continue |
| # Compare results with the equivalent Python predicate |
| # (except "is_space" where functions are known to be incompatible) |
| c = chr(i) |
| if hasattr(pc, arrow_name) and function_name != 'is_space': |
| ar = pa.array([c]) |
| arrow_func = getattr(pc, arrow_name) |
| assert arrow_func(ar)[0].as_py() == getattr(c, py_name)() |
| |
| |
| def test_pad(): |
| arr = pa.array([None, 'a', 'abcd']) |
| assert pc.ascii_center(arr, width=3).tolist() == [None, ' a ', 'abcd'] |
| assert pc.ascii_lpad(arr, width=3).tolist() == [None, ' a', 'abcd'] |
| assert pc.ascii_rpad(arr, width=3).tolist() == [None, 'a ', 'abcd'] |
| assert pc.ascii_center(arr, 3).tolist() == [None, ' a ', 'abcd'] |
| assert pc.ascii_lpad(arr, 3).tolist() == [None, ' a', 'abcd'] |
| assert pc.ascii_rpad(arr, 3).tolist() == [None, 'a ', 'abcd'] |
| |
| arr = pa.array([None, 'á', 'abcd']) |
| assert pc.utf8_center(arr, width=3).tolist() == [None, ' á ', 'abcd'] |
| assert pc.utf8_lpad(arr, width=3).tolist() == [None, ' á', 'abcd'] |
| assert pc.utf8_rpad(arr, width=3).tolist() == [None, 'á ', 'abcd'] |
| assert pc.utf8_center(arr, 3).tolist() == [None, ' á ', 'abcd'] |
| assert pc.utf8_lpad(arr, 3).tolist() == [None, ' á', 'abcd'] |
| assert pc.utf8_rpad(arr, 3).tolist() == [None, 'á ', 'abcd'] |
| |
| |
| def test_utf8_zfill(): |
| assert pc.utf8_zfill is pc.utf8_zero_fill |
| |
| # match str.zfill behavior |
| examples = ['A', 'AB', 'ABC', '', '-1', '+1', '1', None] |
| for width in range(0, 6): |
| arr = pa.array(examples) |
| result = pc.utf8_zero_fill(arr, width=width).to_pylist() |
| expected = [x.zfill(width) if x is not None else None for x in examples] |
| assert result == expected, f"Mismatch at width={width}: {result} vs {expected}" |
| |
| # unicode padding character |
| arr = pa.array(["1", "-2", "+3"]) |
| result = pc.utf8_zero_fill(arr, options=pc.ZeroFillOptions( |
| width=5, padding="💠")).to_pylist() |
| assert result == ["💠💠💠💠1", "-💠💠💠2", "+💠💠💠3"] |
| |
| # custom ASCII padding character |
| arr = pa.array(["1", "-2", "+3"]) |
| result = pc.utf8_zero_fill(arr, options=pc.ZeroFillOptions( |
| width=4, padding="x")).to_pylist() |
| assert result == ["xxx1", "-xx2", "+xx3"] |
| |
    # padding must be exactly one codepoint; longer or empty strings raise
| arr = pa.array(["foo"]) |
| with pytest.raises(pa.ArrowInvalid, match="Padding must be one codepoint"): |
| pc.utf8_zero_fill(arr, options=pc.ZeroFillOptions(width=4, padding="spam")) |
| with pytest.raises(pa.ArrowInvalid, match="Padding must be one codepoint"): |
| pc.utf8_zero_fill(arr, options=pc.ZeroFillOptions(width=4, padding="")) |
| |
| |
| @pytest.mark.pandas |
| def test_replace_slice(): |
| offsets = range(-3, 4) |
| |
| arr = pa.array([None, '', 'a', 'ab', 'abc', 'abcd', 'abcde']) |
| series = arr.to_pandas().astype(object).replace({np.nan: None}) |
| for start in offsets: |
| for stop in offsets: |
| expected = series.str.slice_replace(start, stop, 'XX') |
| actual = pc.binary_replace_slice( |
| arr, start=start, stop=stop, replacement='XX') |
| assert actual.tolist() == expected.tolist() |
| # Positional options |
| assert pc.binary_replace_slice(arr, start, stop, 'XX') == actual |
| |
| arr = pa.array([None, '', 'π', 'πb', 'πbθ', 'πbθd', 'πbθde']) |
| series = arr.to_pandas().astype(object).replace({np.nan: None}) |
| for start in offsets: |
| for stop in offsets: |
| expected = series.str.slice_replace(start, stop, 'XX') |
| actual = pc.utf8_replace_slice( |
| arr, start=start, stop=stop, replacement='XX') |
| assert actual.tolist() == expected.tolist() |
| |
| |
| def test_replace_plain(): |
| data = pa.array(['foozfoo', 'food', None]) |
| ar = pc.replace_substring(data, pattern='foo', replacement='bar') |
| assert ar.tolist() == ['barzbar', 'bard', None] |
| ar = pc.replace_substring(data, 'foo', 'bar') |
| assert ar.tolist() == ['barzbar', 'bard', None] |
| |
| ar = pc.replace_substring(data, pattern='foo', replacement='bar', |
| max_replacements=1) |
| assert ar.tolist() == ['barzfoo', 'bard', None] |
| ar = pc.replace_substring(data, 'foo', 'bar', max_replacements=1) |
| assert ar.tolist() == ['barzfoo', 'bard', None] |
| |
| |
| def test_replace_regex(): |
| data = pa.array(['foo', 'mood', None]) |
| expected = ['f00', 'm00d', None] |
| ar = pc.replace_substring_regex(data, pattern='(.)oo', replacement=r'\100') |
| assert ar.tolist() == expected |
| ar = pc.replace_substring_regex(data, '(.)oo', replacement=r'\100') |
| assert ar.tolist() == expected |
| ar = pc.replace_substring_regex(data, '(.)oo', r'\100') |
| assert ar.tolist() == expected |
| |
| |
| def test_extract_regex(): |
| ar = pa.array(['a1', 'zb2z']) |
| expected = [{'letter': 'a', 'digit': '1'}, {'letter': 'b', 'digit': '2'}] |
| struct = pc.extract_regex(ar, pattern=r'(?P<letter>[ab])(?P<digit>\d)') |
| assert struct.tolist() == expected |
| struct = pc.extract_regex(ar, r'(?P<letter>[ab])(?P<digit>\d)') |
| assert struct.tolist() == expected |
| |
| |
| def test_extract_regex_span(): |
| ar = pa.array(['a1', 'zb234z']) |
| expected = [{'letter': [0, 1], 'digit': [1, 1]}, |
| {'letter': [1, 1], 'digit': [2, 3]}] |
| struct = pc.extract_regex_span(ar, pattern=r'(?P<letter>[ab])(?P<digit>\d+)') |
| assert struct.tolist() == expected |
| struct = pc.extract_regex_span(ar, r'(?P<letter>[ab])(?P<digit>\d+)') |
| assert struct.tolist() == expected |
| |
| |
| def test_binary_join(): |
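    # binary_join concatenates the elements of each list with the separator;
    # an empty list yields an empty string and a null list yields null.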
| ar_list = pa.array([['foo', 'bar'], None, []]) |
| expected = pa.array(['foo-bar', None, '']) |
| assert pc.binary_join(ar_list, '-').equals(expected) |
| |
| separator_array = pa.array(['1', '2'], type=pa.binary()) |
| expected = pa.array(['a1b', 'c2d'], type=pa.binary()) |
| ar_list = pa.array([['a', 'b'], ['c', 'd']], type=pa.list_(pa.binary())) |
| assert pc.binary_join(ar_list, separator_array).equals(expected) |
| |
| |
| def test_binary_join_element_wise(): |
| null = pa.scalar(None, type=pa.string()) |
| arrs = [[None, 'a', 'b'], ['c', None, 'd'], [None, '-', '--']] |
| assert pc.binary_join_element_wise(*arrs).to_pylist() == \ |
| [None, None, 'b--d'] |
| assert pc.binary_join_element_wise('a', 'b', '-').as_py() == 'a-b' |
| assert pc.binary_join_element_wise('a', null, '-').as_py() is None |
| assert pc.binary_join_element_wise('a', 'b', null).as_py() is None |
| |
| skip = pc.JoinOptions(null_handling='skip') |
| assert pc.binary_join_element_wise(*arrs, options=skip).to_pylist() == \ |
| [None, 'a', 'b--d'] |
| assert pc.binary_join_element_wise( |
| 'a', 'b', '-', options=skip).as_py() == 'a-b' |
| assert pc.binary_join_element_wise( |
| 'a', null, '-', options=skip).as_py() == 'a' |
| assert pc.binary_join_element_wise( |
| 'a', 'b', null, options=skip).as_py() is None |
| |
| replace = pc.JoinOptions(null_handling='replace', null_replacement='spam') |
| assert pc.binary_join_element_wise(*arrs, options=replace).to_pylist() == \ |
| [None, 'a-spam', 'b--d'] |
| assert pc.binary_join_element_wise( |
| 'a', 'b', '-', options=replace).as_py() == 'a-b' |
| assert pc.binary_join_element_wise( |
| 'a', null, '-', options=replace).as_py() == 'a-spam' |
| assert pc.binary_join_element_wise( |
| 'a', 'b', null, options=replace).as_py() is None |
| |
| |
| @pytest.mark.parametrize(('ty', 'values'), all_array_types) |
| def test_take(ty, values): |
| arr = pa.array(values, type=ty) |
| for indices_type in [pa.int8(), pa.int64()]: |
| indices = pa.array([0, 4, 2, None], type=indices_type) |
| result = arr.take(indices) |
| result.validate() |
| expected = pa.array([values[0], values[4], values[2], None], type=ty) |
| assert result.equals(expected) |
| |
| # empty indices |
| indices = pa.array([], type=indices_type) |
| result = arr.take(indices) |
| result.validate() |
| expected = pa.array([], type=ty) |
| assert result.equals(expected) |
| |
| indices = pa.array([2, 5]) |
| with pytest.raises(IndexError): |
| arr.take(indices) |
| |
| indices = pa.array([2, -1]) |
| with pytest.raises(IndexError): |
| arr.take(indices) |
| |
| |
| def test_take_indices_types(): |
| arr = pa.array(range(5)) |
| |
| for indices_type in ['uint8', 'int8', 'uint16', 'int16', |
| 'uint32', 'int32', 'uint64', 'int64']: |
| indices = pa.array([0, 4, 2, None], type=indices_type) |
| result = arr.take(indices) |
| result.validate() |
| expected = pa.array([0, 4, 2, None]) |
| assert result.equals(expected) |
| |
| for indices_type in [pa.float32(), pa.float64()]: |
| indices = pa.array([0, 4, 2], type=indices_type) |
| with pytest.raises(NotImplementedError): |
| arr.take(indices) |
| |
| |
| def test_take_on_chunked_array(): |
| # ARROW-9504 |
| arr = pa.chunked_array([ |
| [ |
| "a", |
| "b", |
| "c", |
| "d", |
| "e" |
| ], |
| [ |
| "f", |
| "g", |
| "h", |
| "i", |
| "j" |
| ] |
| ]) |
| |
| indices = pa.array([0, 5, 1, 6, 9, 2]) |
| result = arr.take(indices) |
| expected = pa.chunked_array([["a", "f", "b", "g", "j", "c"]]) |
| assert result.equals(expected) |
| |
| indices = pa.chunked_array([[1], [9, 2]]) |
| result = arr.take(indices) |
| expected = pa.chunked_array([ |
| [ |
| "b" |
| ], |
| [ |
| "j", |
| "c" |
| ] |
| ]) |
| assert result.equals(expected) |
| |
| |
| @pytest.mark.parametrize('ordered', [False, True]) |
| def test_take_dictionary(ordered): |
| arr = pa.DictionaryArray.from_arrays([0, 1, 2, 0, 1, 2], ['a', 'b', 'c'], |
| ordered=ordered) |
| result = arr.take(pa.array([0, 1, 3])) |
| result.validate() |
| assert result.to_pylist() == ['a', 'b', 'a'] |
| assert result.dictionary.to_pylist() == ['a', 'b', 'c'] |
| assert result.type.ordered is ordered |
| |
| |
| def test_take_null_type(): |
| # ARROW-10027 |
| arr = pa.array([None] * 10) |
| chunked_arr = pa.chunked_array([[None] * 5] * 2) |
| batch = pa.record_batch([arr], names=['a']) |
| table = pa.table({'a': arr}) |
| |
| indices = pa.array([1, 3, 7, None]) |
| assert len(arr.take(indices)) == 4 |
| assert len(chunked_arr.take(indices)) == 4 |
| assert len(batch.take(indices).column(0)) == 4 |
| assert len(table.take(indices).column(0)) == 4 |
| |
| |
| @pytest.mark.parametrize(('ty', 'values'), all_array_types) |
| def test_drop_null(ty, values): |
| arr = pa.array(values, type=ty) |
| result = arr.drop_null() |
| result.validate(full=True) |
| indices = [i for i in range(len(arr)) if arr[i].is_valid] |
| expected = arr.take(pa.array(indices)) |
| assert result.equals(expected) |
| |
| |
| def test_drop_null_chunked_array(): |
| arr = pa.chunked_array([["a", None], ["c", "d", None], [None], []]) |
| expected_drop = pa.chunked_array([["a"], ["c", "d"], [], []]) |
| |
| result = arr.drop_null() |
| assert result.equals(expected_drop) |
| |
| |
| def test_drop_null_record_batch(): |
| batch = pa.record_batch( |
| [pa.array(["a", None, "c", "d", None])], names=["a'"]) |
| result = batch.drop_null() |
| expected = pa.record_batch([pa.array(["a", "c", "d"])], names=["a'"]) |
| assert result.equals(expected) |
| |
| batch = pa.record_batch( |
| [pa.array(["a", None, "c", "d", None]), |
| pa.array([None, None, "c", None, "e"])], names=["a'", "b'"]) |
| |
| result = batch.drop_null() |
| expected = pa.record_batch( |
| [pa.array(["c"]), pa.array(["c"])], names=["a'", "b'"]) |
| assert result.equals(expected) |
| |
| |
| def test_drop_null_table(): |
| table = pa.table([pa.array(["a", None, "c", "d", None])], names=["a"]) |
| expected = pa.table([pa.array(["a", "c", "d"])], names=["a"]) |
| result = table.drop_null() |
| assert result.equals(expected) |
| |
| table = pa.table([pa.chunked_array([["a", None], ["c", "d", None]]), |
| pa.chunked_array([["a", None], [None, "d", None]]), |
| pa.chunked_array([["a"], ["b"], [None], ["d", None]])], |
| names=["a", "b", "c"]) |
| expected = pa.table([pa.array(["a", "d"]), |
| pa.array(["a", "d"]), |
| pa.array(["a", "d"])], |
| names=["a", "b", "c"]) |
| result = table.drop_null() |
| assert result.equals(expected) |
| |
| table = pa.table([pa.chunked_array([["a", "b"], ["c", "d", "e"]]), |
| pa.chunked_array([["A"], ["B"], [None], ["D", None]]), |
| pa.chunked_array([["a`", None], ["c`", "d`", None]])], |
| names=["a", "b", "c"]) |
| expected = pa.table([pa.array(["a", "d"]), |
| pa.array(["A", "D"]), |
| pa.array(["a`", "d`"])], |
| names=["a", "b", "c"]) |
| result = table.drop_null() |
| assert result.equals(expected) |
| |
| |
| def test_drop_null_null_type(): |
| arr = pa.array([None] * 10) |
| chunked_arr = pa.chunked_array([[None] * 5] * 2) |
| batch = pa.record_batch([arr], names=['a']) |
| table = pa.table({'a': arr}) |
| |
| assert len(arr.drop_null()) == 0 |
| assert len(chunked_arr.drop_null()) == 0 |
| assert len(batch.drop_null().column(0)) == 0 |
| assert len(table.drop_null().column(0)) == 0 |
| |
| |
| @pytest.mark.parametrize(('ty', 'values'), all_array_types) |
| def test_filter(ty, values): |
| arr = pa.array(values, type=ty) |
| |
| mask = pa.array([True, False, False, True, None]) |
| result = arr.filter(mask, null_selection_behavior='drop') |
| result.validate() |
| assert result.equals(pa.array([values[0], values[3]], type=ty)) |
| result = arr.filter(mask, null_selection_behavior='emit_null') |
| result.validate() |
| assert result.equals(pa.array([values[0], values[3], None], type=ty)) |
| |
| # non-boolean dtype |
| mask = pa.array([0, 1, 0, 1, 0]) |
| with pytest.raises(NotImplementedError): |
| arr.filter(mask) |
| |
| # wrong length |
| mask = pa.array([True, False, True]) |
| with pytest.raises(ValueError, match="must all be the same length"): |
| arr.filter(mask) |
| |
| |
| @pytest.mark.numpy |
| @pytest.mark.parametrize(('ty', 'values'), all_array_types) |
| def test_filter_numpy_array_mask(ty, values): |
| arr = pa.array(values, type=ty) |
    # same test as test_filter, but with a NumPy array as the mask
| mask = np.array([True, False, False, True, None]) |
| result = arr.filter(mask, null_selection_behavior='drop') |
| result.validate() |
| assert result.equals(pa.array([values[0], values[3]], type=ty)) |
| |
| |
| def test_filter_chunked_array(): |
| arr = pa.chunked_array([["a", None], ["c", "d", "e"]]) |
| expected_drop = pa.chunked_array([["a"], ["e"]]) |
| expected_null = pa.chunked_array([["a"], [None, "e"]]) |
| |
| for mask in [ |
| # mask is array |
| pa.array([True, False, None, False, True]), |
| # mask is chunked array |
| pa.chunked_array([[True, False, None], [False, True]]), |
| # mask is python object |
| [True, False, None, False, True] |
| ]: |
| result = arr.filter(mask) |
| assert result.equals(expected_drop) |
| result = arr.filter(mask, null_selection_behavior="emit_null") |
| assert result.equals(expected_null) |
| |
| |
| def test_filter_record_batch(): |
| batch = pa.record_batch( |
| [pa.array(["a", None, "c", "d", "e"])], names=["a'"]) |
| |
| # mask is array |
| mask = pa.array([True, False, None, False, True]) |
| result = batch.filter(mask) |
| expected = pa.record_batch([pa.array(["a", "e"])], names=["a'"]) |
| assert result.equals(expected) |
| |
| # GH-38770: mask is chunked array |
| chunked_mask = pa.chunked_array([[True, False], [None], [False, True]]) |
| result = batch.filter(chunked_mask) |
| assert result.equals(expected) |
| |
| result = batch.filter(mask, null_selection_behavior="emit_null") |
| expected = pa.record_batch([pa.array(["a", None, "e"])], names=["a'"]) |
| assert result.equals(expected) |
| |
| |
| def test_filter_table(): |
| table = pa.table([pa.array(["a", None, "c", "d", "e"])], names=["a"]) |
| expected_drop = pa.table([pa.array(["a", "e"])], names=["a"]) |
| expected_null = pa.table([pa.array(["a", None, "e"])], names=["a"]) |
| |
| for mask in [ |
| # mask is array |
| pa.array([True, False, None, False, True]), |
| # mask is chunked array |
| pa.chunked_array([[True, False], [None, False, True]]), |
| # mask is python object |
| [True, False, None, False, True] |
| ]: |
| result = table.filter(mask) |
| assert result.equals(expected_drop) |
| result = table.filter(mask, null_selection_behavior="emit_null") |
| assert result.equals(expected_null) |
| |
| |
| def test_filter_errors(): |
| arr = pa.chunked_array([["a", None], ["c", "d", "e"]]) |
| batch = pa.record_batch( |
| [pa.array(["a", None, "c", "d", "e"])], names=["a'"]) |
| table = pa.table([pa.array(["a", None, "c", "d", "e"])], names=["a"]) |
| |
| for obj in [arr, batch, table]: |
| # non-boolean dtype |
| mask = pa.array([0, 1, 0, 1, 0]) |
| with pytest.raises(NotImplementedError): |
| obj.filter(mask) |
| |
| # wrong length |
| mask = pa.array([True, False, True]) |
| with pytest.raises(pa.ArrowInvalid, |
| match="must all be the same length"): |
| obj.filter(mask) |
| |
| scalar = pa.scalar(True) |
| for filt in [batch, table, scalar]: |
| with pytest.raises(TypeError): |
| table.filter(filt) |
| |
| |
| def test_filter_null_type(): |
| # ARROW-10027 |
| arr = pa.array([None] * 10) |
| chunked_arr = pa.chunked_array([[None] * 5] * 2) |
| batch = pa.record_batch([arr], names=['a']) |
| table = pa.table({'a': arr}) |
| |
| mask = pa.array([True, False] * 5) |
| assert len(arr.filter(mask)) == 5 |
| assert len(chunked_arr.filter(mask)) == 5 |
| assert len(batch.filter(mask).column(0)) == 5 |
| assert len(table.filter(mask).column(0)) == 5 |
| |
| |
| def test_inverse_permutation(): |
| arr0 = pa.array([], type=pa.int32()) |
| arr = pa.chunked_array([ |
| arr0, [9, 7, 5, 3, 1], [0], [2, 4, 6], [8], arr0, |
| ]) |
| expected = pa.chunked_array([[5, 4, 6, 3, 7, 2, 8, 1, 9, 0]], type=pa.int32()) |
| assert pc.inverse_permutation(arr).equals(expected) |
| |
| options = pc.InversePermutationOptions(max_index=9, output_type=pa.int32()) |
| assert pc.inverse_permutation(arr, options=options).equals(expected) |
| assert pc.inverse_permutation(arr, max_index=-1).equals(expected) |
| |
| with pytest.raises(ArrowIndexError, match="Index out of bounds: 9"): |
| pc.inverse_permutation(arr, max_index=4) |
| |
| |
| def test_scatter(): |
| values = pa.array([True, False, True, True, False, False, True, True, True, False]) |
| indices = pa.array([9, 8, 7, 6, 5, 4, 3, 2, 1, 0]) |
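    # scatter places values[i] at position indices[i]; with fully reversed
    # indices the expected result is simply `values` reversed.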
| expected = pa.array([False, True, True, True, False, |
| False, True, True, False, True]) |
| result = pc.scatter(values, indices) |
| assert result.equals(expected) |
| |
| options = pc.ScatterOptions(max_index=-1) |
| assert pc.scatter(values, indices, options=options).equals(expected) |
| assert pc.scatter(values, indices, max_index=9).equals(expected) |
| |
| with pytest.raises(ArrowIndexError, match="Index out of bounds: 9"): |
| pc.scatter(values, indices, max_index=4) |
| |
| |
| @pytest.mark.parametrize("typ", ["array", "chunked_array"]) |
| def test_compare_array(typ): |
| if typ == "array": |
| def con(values): |
| return pa.array(values) |
| else: |
| def con(values): |
| return pa.chunked_array([values]) |
| |
| arr1 = con([1, 2, 3, 4, None]) |
| arr2 = con([1, 1, 4, None, 4]) |
| |
| result = pc.equal(arr1, arr2) |
| assert result.equals(con([True, False, False, None, None])) |
| |
| result = pc.not_equal(arr1, arr2) |
| assert result.equals(con([False, True, True, None, None])) |
| |
| result = pc.less(arr1, arr2) |
| assert result.equals(con([False, False, True, None, None])) |
| |
| result = pc.less_equal(arr1, arr2) |
| assert result.equals(con([True, False, True, None, None])) |
| |
| result = pc.greater(arr1, arr2) |
| assert result.equals(con([False, True, False, None, None])) |
| |
| result = pc.greater_equal(arr1, arr2) |
| assert result.equals(con([True, True, False, None, None])) |
| |
| |
| @pytest.mark.parametrize("typ", ["array", "chunked_array"]) |
| def test_compare_string_scalar(typ): |
| if typ == "array": |
| def con(values): |
| return pa.array(values) |
| else: |
| def con(values): |
| return pa.chunked_array([values]) |
| |
| arr = con(['a', 'b', 'c', None]) |
| scalar = pa.scalar('b') |
| |
| result = pc.equal(arr, scalar) |
| assert result.equals(con([False, True, False, None])) |
| |
| if typ == "array": |
| nascalar = pa.scalar(None, type="string") |
| result = pc.equal(arr, nascalar) |
| isnull = pc.is_null(result) |
| assert isnull.equals(con([True, True, True, True])) |
| |
| result = pc.not_equal(arr, scalar) |
| assert result.equals(con([True, False, True, None])) |
| |
| result = pc.less(arr, scalar) |
| assert result.equals(con([True, False, False, None])) |
| |
| result = pc.less_equal(arr, scalar) |
| assert result.equals(con([True, True, False, None])) |
| |
| result = pc.greater(arr, scalar) |
| assert result.equals(con([False, False, True, None])) |
| |
| result = pc.greater_equal(arr, scalar) |
| assert result.equals(con([False, True, True, None])) |
| |
| |
| @pytest.mark.parametrize("typ", ["array", "chunked_array"]) |
| def test_compare_scalar(typ): |
| if typ == "array": |
| def con(values): |
| return pa.array(values) |
| else: |
| def con(values): |
| return pa.chunked_array([values]) |
| |
| arr = con([1, 2, 3, None]) |
| scalar = pa.scalar(2) |
| |
| result = pc.equal(arr, scalar) |
| assert result.equals(con([False, True, False, None])) |
| |
| if typ == "array": |
| nascalar = pa.scalar(None, type="int64") |
| result = pc.equal(arr, nascalar) |
| assert result.to_pylist() == [None, None, None, None] |
| |
| result = pc.not_equal(arr, scalar) |
| assert result.equals(con([True, False, True, None])) |
| |
| result = pc.less(arr, scalar) |
| assert result.equals(con([True, False, False, None])) |
| |
| result = pc.less_equal(arr, scalar) |
| assert result.equals(con([True, True, False, None])) |
| |
| result = pc.greater(arr, scalar) |
| assert result.equals(con([False, False, True, None])) |
| |
| result = pc.greater_equal(arr, scalar) |
| assert result.equals(con([False, True, True, None])) |
| |
| |
| def test_compare_chunked_array_mixed(): |
| arr = pa.array([1, 2, 3, 4, None]) |
| arr_chunked = pa.chunked_array([[1, 2, 3], [4, None]]) |
| arr_chunked2 = pa.chunked_array([[1, 2], [3, 4, None]]) |
| |
| expected = pa.chunked_array([[True, True, True, True, None]]) |
| |
| for left, right in [ |
| (arr, arr_chunked), |
| (arr_chunked, arr), |
| (arr_chunked, arr_chunked2), |
| ]: |
| result = pc.equal(left, right) |
| assert result.equals(expected) |
| |
| |
| def test_arithmetic_add(): |
| left = pa.array([1, 2, 3, 4, 5]) |
| right = pa.array([0, -1, 1, 2, 3]) |
| result = pc.add(left, right) |
| expected = pa.array([1, 1, 4, 6, 8]) |
| assert result.equals(expected) |
| |
| |
| def test_arithmetic_subtract(): |
| left = pa.array([1, 2, 3, 4, 5]) |
| right = pa.array([0, -1, 1, 2, 3]) |
| result = pc.subtract(left, right) |
| expected = pa.array([1, 3, 2, 2, 2]) |
| assert result.equals(expected) |
| |
| |
| def test_arithmetic_multiply(): |
| left = pa.array([1, 2, 3, 4, 5]) |
| right = pa.array([0, -1, 1, 2, 3]) |
| result = pc.multiply(left, right) |
| expected = pa.array([0, -2, 3, 8, 15]) |
| assert result.equals(expected) |
| |
| |
| @pytest.mark.parametrize("ty", ["round", "round_to_multiple"]) |
| def test_round_to_integer(ty): |
| if ty == "round": |
| round = pc.round |
| RoundOptions = partial(pc.RoundOptions, ndigits=0) |
| elif ty == "round_to_multiple": |
| round = pc.round_to_multiple |
| RoundOptions = partial(pc.RoundToMultipleOptions, multiple=1) |
| |
| values = [3.2, 3.5, 3.7, 4.5, -3.2, -3.5, -3.7, None] |
| rmode_and_expected = { |
| "down": [3, 3, 3, 4, -4, -4, -4, None], |
| "up": [4, 4, 4, 5, -3, -3, -3, None], |
| "towards_zero": [3, 3, 3, 4, -3, -3, -3, None], |
| "towards_infinity": [4, 4, 4, 5, -4, -4, -4, None], |
| "half_down": [3, 3, 4, 4, -3, -4, -4, None], |
| "half_up": [3, 4, 4, 5, -3, -3, -4, None], |
| "half_towards_zero": [3, 3, 4, 4, -3, -3, -4, None], |
| "half_towards_infinity": [3, 4, 4, 5, -3, -4, -4, None], |
| "half_to_even": [3, 4, 4, 4, -3, -4, -4, None], |
| "half_to_odd": [3, 3, 4, 5, -3, -3, -4, None], |
| } |
| for round_mode, expected in rmode_and_expected.items(): |
| options = RoundOptions(round_mode=round_mode) |
| result = round(values, options=options) |
| expected_array = pa.array(expected, type=pa.float64()) |
| assert expected_array.equals(result) |
| |
| |
| @pytest.mark.numpy |
| def test_round(): |
| values = [320, 3.5, 3.075, 4.5, -3.212, -35.1234, -3.045, None] |
| ndigits_and_expected = { |
| -2: [300, 0, 0, 0, -0, -0, -0, None], |
| -1: [320, 0, 0, 0, -0, -40, -0, None], |
| 0: [320, 4, 3, 5, -3, -35, -3, None], |
| 1: [320, 3.5, 3.1, 4.5, -3.2, -35.1, -3, None], |
| 2: [320, 3.5, 3.08, 4.5, -3.21, -35.12, -3.05, None], |
| } |
| for ndigits, expected in ndigits_and_expected.items(): |
| options = pc.RoundOptions(ndigits, "half_towards_infinity") |
| result = pc.round(values, options=options) |
| np.testing.assert_allclose(result, pa.array(expected), equal_nan=True) |
| assert pc.round(values, ndigits, |
| round_mode="half_towards_infinity") == result |
| assert pc.round(values, ndigits, "half_towards_infinity") == result |
| |
| |
| @pytest.mark.numpy |
| def test_round_to_multiple(): |
| values = [320, 3.5, 3.075, 4.5, -3.212, -35.1234, -3.045, None] |
| multiple_and_expected = { |
| 0.05: [320, 3.5, 3.1, 4.5, -3.2, -35.1, -3.05, None], |
| pa.scalar(0.1): [320, 3.5, 3.1, 4.5, -3.2, -35.1, -3, None], |
| 2: [320, 4, 4, 4, -4, -36, -4, None], |
| 10: [320, 0, 0, 0, -0, -40, -0, None], |
| pa.scalar(100, type=pa.decimal256(10, 4)): |
| [300, 0, 0, 0, -0, -0, -0, None], |
| } |
| for multiple, expected in multiple_and_expected.items(): |
| options = pc.RoundToMultipleOptions(multiple, "half_towards_infinity") |
| result = pc.round_to_multiple(values, options=options) |
| np.testing.assert_allclose(result, pa.array(expected), equal_nan=True) |
| assert pc.round_to_multiple(values, multiple, |
| "half_towards_infinity") == result |
| |
| for multiple in [0, -2, pa.scalar(-10.4)]: |
| with pytest.raises(pa.ArrowInvalid, |
| match="Rounding multiple must be positive"): |
| pc.round_to_multiple(values, multiple=multiple) |
| |
| for multiple in [object, 99999999999999999999999]: |
| with pytest.raises(TypeError, match="is not a valid multiple type"): |
| pc.round_to_multiple(values, multiple=multiple) |
| |
| |
| def test_round_binary(): |
| values = [123.456, 234.567, 345.678, 456.789, 123.456, 234.567, 345.678] |
| scales = pa.array([-3, -2, -1, 0, 1, 2, 3], pa.int32()) |
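    # Each value is rounded to its per-element scale; a negative scale
    # rounds to the left of the decimal point (e.g. -3 rounds to the
    # nearest thousand).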
| expected = pa.array( |
| [0, 200, 350, 457, 123.5, 234.57, 345.678], pa.float64()) |
| assert pc.round_binary(values, scales) == expected |
| |
| expect_zero = pa.scalar(0, pa.float64()) |
| expect_inf = pa.scalar(10, pa.float64()) |
| scale = pa.scalar(-1, pa.int32()) |
| |
| assert pc.round_binary( |
| 5.0, scale, round_mode="half_towards_zero") == expect_zero |
| assert pc.round_binary( |
| 5.0, scale, round_mode="half_towards_infinity") == expect_inf |
| |
| |
| def test_is_null(): |
| arr = pa.array([1, 2, 3, None]) |
| result = arr.is_null() |
| expected = pa.array([False, False, False, True]) |
| assert result.equals(expected) |
| assert result.equals(pc.is_null(arr)) |
| result = arr.is_valid() |
| expected = pa.array([True, True, True, False]) |
| assert result.equals(expected) |
| assert result.equals(pc.is_valid(arr)) |
| |
| arr = pa.chunked_array([[1, 2], [3, None]]) |
| result = arr.is_null() |
| expected = pa.chunked_array([[False, False], [False, True]]) |
| assert result.equals(expected) |
| result = arr.is_valid() |
| expected = pa.chunked_array([[True, True], [True, False]]) |
| assert result.equals(expected) |
| |
| arr = pa.array([1, 2, 3, None, float("nan")]) |
| result = arr.is_null() |
| expected = pa.array([False, False, False, True, False]) |
| assert result.equals(expected) |
| |
| result = arr.is_null(nan_is_null=True) |
| expected = pa.array([False, False, False, True, True]) |
| assert result.equals(expected) |
| |
| |
| def test_is_nan(): |
| arr = pa.array([1, 2, 3, None, float("nan")]) |
| result = arr.is_nan() |
| expected = pa.array([False, False, False, None, True]) |
| assert result.equals(expected) |
| |
| arr = pa.array(["1", "2", None], type=pa.string()) |
| with pytest.raises( |
| ArrowNotImplementedError, match="has no kernel matching input types"): |
| _ = arr.is_nan() |
| |
| with pytest.raises( |
| ArrowNotImplementedError, match="has no kernel matching input types"): |
| arr = pa.array([b'a', b'bb', None], type=pa.large_binary()) |
| _ = arr.is_nan() |
| |
| |
| def test_fill_null(): |
| arr = pa.array([1, 2, None, 4], type=pa.int8()) |
| fill_value = pa.array([5], type=pa.int8()) |
| with pytest.raises(pa.ArrowInvalid, |
| match="Array arguments must all be the same length"): |
| arr.fill_null(fill_value) |
| |
| arr = pa.array([None, None, None, None], type=pa.null()) |
| fill_value = pa.scalar(None, type=pa.null()) |
| result = arr.fill_null(fill_value) |
| expected = pa.array([None, None, None, None]) |
| assert result.equals(expected) |
| |
| arr = pa.array(['a', 'bb', None]) |
| result = arr.fill_null('ccc') |
| expected = pa.array(['a', 'bb', 'ccc']) |
| assert result.equals(expected) |
| |
| arr = pa.array([b'a', b'bb', None], type=pa.large_binary()) |
| result = arr.fill_null('ccc') |
| expected = pa.array([b'a', b'bb', b'ccc'], type=pa.large_binary()) |
| assert result.equals(expected) |
| |
| arr = pa.array(['a', 'bb', None]) |
| result = arr.fill_null(None) |
| expected = pa.array(['a', 'bb', None]) |
| assert result.equals(expected) |
| |
| |
| @pytest.mark.parametrize('arrow_type', numerical_arrow_types) |
| def test_fill_null_array(arrow_type): |
| arr = pa.array([1, 2, None, 4], type=arrow_type) |
| fill_value = pa.scalar(5, type=arrow_type) |
| result = arr.fill_null(fill_value) |
| expected = pa.array([1, 2, 5, 4], type=arrow_type) |
| assert result.equals(expected) |
| |
| # Implicit conversions |
| result = arr.fill_null(5) |
| assert result.equals(expected) |
| |
| # ARROW-9451: Unsigned integers allow this for some reason |
| if not pa.types.is_unsigned_integer(arr.type): |
| with pytest.raises((ValueError, TypeError)): |
| arr.fill_null('5') |
| |
| result = arr.fill_null(pa.scalar(5, type='int8')) |
| assert result.equals(expected) |
| |
| |
| @pytest.mark.parametrize('arrow_type', numerical_arrow_types) |
| def test_fill_null_chunked_array(arrow_type): |
| fill_value = pa.scalar(5, type=arrow_type) |
| arr = pa.chunked_array([pa.array([None, 2, 3, 4], type=arrow_type)]) |
| result = arr.fill_null(fill_value) |
| expected = pa.chunked_array([pa.array([5, 2, 3, 4], type=arrow_type)]) |
| assert result.equals(expected) |
| |
| arr = pa.chunked_array([ |
| pa.array([1, 2], type=arrow_type), |
| pa.array([], type=arrow_type), |
| pa.array([None, 4], type=arrow_type) |
| ]) |
| expected = pa.chunked_array([ |
| pa.array([1, 2], type=arrow_type), |
| pa.array([], type=arrow_type), |
| pa.array([5, 4], type=arrow_type) |
| ]) |
| result = arr.fill_null(fill_value) |
| assert result.equals(expected) |
| |
| # Implicit conversions |
| result = arr.fill_null(5) |
| assert result.equals(expected) |
| |
| result = arr.fill_null(pa.scalar(5, type='int8')) |
| assert result.equals(expected) |
| |
| |
| def test_fill_null_windows_regression(): |
| # Regression test for GH-47234, which was failing due to a MSVC compiler bug |
| # (possibly https://developercommunity.visualstudio.com/t/10912292 |
| # or https://developercommunity.visualstudio.com/t/10945478) |
| arr = pa.array([True, False, False, False, False, None]) |
| s = pa.scalar(True, type=pa.bool_()) |
| |
| result = pa.compute.call_function("coalesce", [arr, s]) |
| result.validate(full=True) |
| expected = pa.array([True, False, False, False, False, True]) |
| assert result.equals(expected) |
| |
| for ty in [pa.int8(), pa.int16(), pa.int32(), pa.int64()]: |
| arr = pa.array([1, 2, 3, 4, 5, None], type=ty) |
| s = pa.scalar(42, type=ty) |
| |
| result = pa.compute.call_function("coalesce", [arr, s]) |
| result.validate(full=True) |
| expected = pa.array([1, 2, 3, 4, 5, 42], type=ty) |
| assert result.equals(expected) |
| |
| |
| def test_logical(): |
| a = pa.array([True, False, False, None]) |
| b = pa.array([True, True, False, True]) |
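    # Kleene logic treats null as "unknown": null OR true is true and
    # null AND false is false; the non-Kleene variants propagate null
    # unconditionally.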
| |
| assert pc.and_(a, b) == pa.array([True, False, False, None]) |
| assert pc.and_kleene(a, b) == pa.array([True, False, False, None]) |
| |
| assert pc.or_(a, b) == pa.array([True, True, False, None]) |
| assert pc.or_kleene(a, b) == pa.array([True, True, False, True]) |
| |
| assert pc.xor(a, b) == pa.array([False, True, False, None]) |
| |
| assert pc.invert(a) == pa.array([False, True, True, None]) |
| |
| |
| def test_dictionary_decode(): |
| array = pa.array(["a", "a", "b", "c", "b"]) |
| dictionary_array = array.dictionary_encode() |
| dictionary_array_decode = pc.dictionary_decode(dictionary_array) |
| |
| assert array != dictionary_array |
| |
| assert array == dictionary_array_decode |
| assert array == pc.dictionary_decode(array) |
| assert pc.dictionary_encode(dictionary_array) == dictionary_array |
| |
| |
| def test_cast(): |
| arr = pa.array([1, 2, 3, 4], type='int64') |
| options = pc.CastOptions(pa.int8()) |
| |
| with pytest.raises(TypeError): |
| pc.cast(arr, target_type=None) |
| |
| with pytest.raises(ValueError): |
| pc.cast(arr, 'int32', options=options) |
| |
| with pytest.raises(ValueError): |
| pc.cast(arr, safe=True, options=options) |
| |
| assert pc.cast(arr, options=options) == pa.array( |
| [1, 2, 3, 4], type='int8') |
| |
| arr = pa.array([2 ** 63 - 1], type='int64') |
| allow_overflow_options = pc.CastOptions( |
| pa.int32(), allow_int_overflow=True) |
| |
| with pytest.raises(pa.ArrowInvalid): |
| pc.cast(arr, 'int32') |
| |
| assert pc.cast(arr, 'int32', safe=False) == pa.array([-1], type='int32') |
| |
| assert pc.cast(arr, options=allow_overflow_options) == pa.array( |
| [-1], type='int32') |
| |
| arr = pa.array( |
| [datetime.datetime(2010, 1, 1), datetime.datetime(2015, 1, 1)]) |
| expected = pa.array([1262304000000, 1420070400000], type='timestamp[ms]') |
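    # (1262304000000 ms and 1420070400000 ms are the epoch offsets of
    # 2010-01-01 and 2015-01-01, respectively)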
| assert pc.cast(arr, 'timestamp[ms]') == expected |
| |
| arr = pa.array([[1, 2], [3, 4, 5]], type=pa.large_list(pa.int8())) |
| expected = pa.array([["1", "2"], ["3", "4", "5"]], |
| type=pa.list_(pa.utf8())) |
| assert pc.cast(arr, expected.type) == expected |
| |
| |
| @pytest.mark.parametrize('value_type', [pa.date32(), pa.date64()]) |
| def test_identity_cast_dates(value_type): |
| dt = datetime.date(1990, 3, 1) |
| |
| arr = pa.array([dt], type=value_type) |
| assert pc.cast(arr, value_type) == arr |
| |
| |
| @pytest.mark.parametrize('value_type', numerical_arrow_types) |
| def test_fsl_to_fsl_cast(value_type): |
| # Different field name and different type. |
| cast_type = pa.list_(pa.field("element", value_type), 2) |
| |
| dtype = pa.int32() |
| type = pa.list_(pa.field("values", dtype), 2) |
| |
| fsl = pa.FixedSizeListArray.from_arrays( |
| pa.array([1, 2, 3, 4, 5, 6], type=dtype), type=type) |
| assert cast_type == fsl.cast(cast_type).type |
| |
| # Different field name and different type (with null values). |
| fsl = pa.FixedSizeListArray.from_arrays( |
| pa.array([1, None, None, 4, 5, 6], type=dtype), type=type) |
| assert cast_type == fsl.cast(cast_type).type |
| |
| # Null FSL type. |
| dtype = pa.null() |
| type = pa.list_(pa.field("values", dtype), 2) |
| fsl = pa.FixedSizeListArray.from_arrays( |
| pa.array([None, None, None, None, None, None], type=dtype), type=type) |
| assert cast_type == fsl.cast(cast_type).type |
| |
| # Different sized FSL |
| cast_type = pa.list_(pa.field("element", value_type), 3) |
| err_msg = 'Size of FixedSizeList is not the same.' |
| with pytest.raises(pa.lib.ArrowTypeError, match=err_msg): |
| fsl.cast(cast_type) |
| |
| |
| DecimalTypeTraits = namedtuple('DecimalTypeTraits', |
| ('name', 'factory', 'max_precision')) |
| |
| FloatToDecimalCase = namedtuple('FloatToDecimalCase', |
| ('precision', 'scale', 'float_val')) |
| |
| decimal_type_traits = [DecimalTypeTraits('decimal32', pa.decimal32, 9), |
| DecimalTypeTraits('decimal64', pa.decimal64, 18), |
| DecimalTypeTraits('decimal128', pa.decimal128, 38), |
| DecimalTypeTraits('decimal256', pa.decimal256, 76)] |
| |
| |
| def largest_scaled_float_not_above(val, scale): |
| """ |
    Find the largest float f such that `f * 10**scale <= val`
| """ |
| assert val >= 0 |
| assert scale >= 0 |
| float_val = float(val) / 10**scale |
| if float_val * 10**scale > val: |
        # Take the float just below... it *should* satisfy the bound
| float_val = np.nextafter(float_val, 0.0) |
| if float_val * 10**scale > val: |
| float_val = np.nextafter(float_val, 0.0) |
| assert float_val * 10**scale <= val |
| return float_val |
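

@pytest.mark.numpy
def test_largest_scaled_float_not_above():
    # Minimal sanity check of the helper's documented contract; the cases
    # below are illustrative, not exhaustive.
    for val, scale in [(0, 0), (1, 0), (999, 2), (10**15 - 1, 5)]:
        float_val = largest_scaled_float_not_above(val, scale)
        assert float_val * 10**scale <= val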
| |
| |
| def scaled_float(int_val, scale): |
| """ |
    Return a float representation (possibly approximate) of
    `int_val * 10**-scale`
| """ |
| assert isinstance(int_val, int) |
| unscaled = decimal.Decimal(int_val) |
| scaled = unscaled.scaleb(-scale) |
| float_val = float(scaled) |
| return float_val |
| |
| |
| def integral_float_to_decimal_cast_cases(float_ty, max_precision): |
| """ |
    Yield FloatToDecimalCase instances with integral values.
| """ |
| mantissa_digits = 16 |
| for precision in range(1, max_precision, 3): |
| for scale in range(0, precision, 2): |
| yield FloatToDecimalCase(precision, scale, 0.0) |
| yield FloatToDecimalCase(precision, scale, 1.0) |
| epsilon = 10**max(precision - mantissa_digits, scale) |
| abs_maxval = largest_scaled_float_not_above( |
| 10**precision - epsilon, scale) |
| yield FloatToDecimalCase(precision, scale, abs_maxval) |
| |
| |
| def real_float_to_decimal_cast_cases(float_ty, max_precision): |
| """ |
    Yield FloatToDecimalCase instances with real values.
| """ |
| mantissa_digits = 16 |
| for precision in range(1, max_precision, 3): |
| for scale in range(0, precision, 2): |
| epsilon = 2 * 10**max(precision - mantissa_digits, 0) |
| abs_minval = largest_scaled_float_not_above(epsilon, scale) |
| abs_maxval = largest_scaled_float_not_above( |
| 10**precision - epsilon, scale) |
| yield FloatToDecimalCase(precision, scale, abs_minval) |
| yield FloatToDecimalCase(precision, scale, abs_maxval) |
| |
| |
| def random_float_to_decimal_cast_cases(float_ty, max_precision): |
| """ |
    Yield randomly generated FloatToDecimalCase instances.
| """ |
| r = random.Random(42) |
| for precision in range(1, max_precision, 6): |
| for scale in range(0, precision, 4): |
| for i in range(20): |
| unscaled = r.randrange(0, 10**precision) |
| float_val = scaled_float(unscaled, scale) |
| assert float_val * 10**scale < 10**precision |
| yield FloatToDecimalCase(precision, scale, float_val) |
| |
| |
| def check_cast_float_to_decimal(float_ty, float_val, decimal_ty, decimal_ctx, |
| max_precision): |
| # Use the Python decimal module to build the expected result |
| # using the right precision |
| decimal_ctx.prec = decimal_ty.precision |
| decimal_ctx.rounding = decimal.ROUND_HALF_EVEN |
| expected = decimal_ctx.create_decimal_from_float(float_val) |
| # Round `expected` to `scale` digits after the decimal point |
| expected = expected.quantize(decimal.Decimal(1).scaleb(-decimal_ty.scale)) |
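    # e.g. with scale=2, Decimal(1).scaleb(-2) is Decimal("0.01"), so
    # Decimal("123.456").quantize(Decimal("0.01")) gives Decimal("123.46")
    # under ROUND_HALF_EVEN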
| s = pa.scalar(float_val, type=float_ty) |
| actual = pc.cast(s, decimal_ty).as_py() |
| if actual != expected: |
| # Allow the last digit to vary. The tolerance is higher for |
| # very high precisions as rounding errors can accumulate in |
| # the iterative algorithm (GH-35576). |
| diff_digits = abs(actual - expected) * 10**decimal_ty.scale |
| limit = 2 if decimal_ty.precision < max_precision - 2 else 4 |
| assert diff_digits <= limit, ( |
| f"float_val = {float_val!r}, precision={decimal_ty.precision}, " |
| f"expected = {expected!r}, actual = {actual!r}, " |
| f"diff_digits = {diff_digits!r}") |
| |
| |
| # Cannot test float32 as case generators above assume float64 |
| @pytest.mark.numpy |
| @pytest.mark.parametrize('float_ty', [pa.float64()], ids=str) |
| @pytest.mark.parametrize('decimal_ty', decimal_type_traits, |
| ids=lambda v: v.name) |
| @pytest.mark.parametrize('case_generator', |
| [integral_float_to_decimal_cast_cases, |
| real_float_to_decimal_cast_cases, |
| random_float_to_decimal_cast_cases], |
| ids=['integrals', 'reals', 'random']) |
| def test_cast_float_to_decimal(float_ty, decimal_ty, case_generator): |
| with decimal.localcontext() as ctx: |
| for case in case_generator(float_ty, decimal_ty.max_precision): |
| check_cast_float_to_decimal( |
| float_ty, case.float_val, |
| decimal_ty.factory(case.precision, case.scale), |
| ctx, decimal_ty.max_precision) |
| |
| |
| @pytest.mark.numpy |
| @pytest.mark.parametrize('float_ty', [pa.float32(), pa.float64()], ids=str) |
| @pytest.mark.parametrize('decimal_traits', decimal_type_traits, |
| ids=lambda v: v.name) |
| def test_cast_float_to_decimal_random(float_ty, decimal_traits): |
| """ |
| Test float-to-decimal conversion against exactly generated values. |
| """ |
| r = random.Random(43) |
| np_float_ty = { |
| pa.float32(): np.float32, |
| pa.float64(): np.float64, |
| }[float_ty] |
| mantissa_bits = { |
| pa.float32(): 24, |
| pa.float64(): 53, |
| }[float_ty] |
| float_exp_min, float_exp_max = { |
| pa.float32(): (-126, 127), |
| pa.float64(): (-1022, 1023), |
| }[float_ty] |
| mantissa_digits = math.floor(math.log10(2**mantissa_bits)) |
| max_precision = decimal_traits.max_precision |
| |
    # Handle decimal types narrower than the float mantissa
    # (for example, decimal32 vs. float64)
| if max_precision < mantissa_digits: |
| mantissa_bits = math.floor(math.log2(10**max_precision)) |
| mantissa_digits = math.floor(math.log10(2**mantissa_bits)) |
| |
| with decimal.localcontext() as ctx: |
| precision = mantissa_digits |
| ctx.prec = precision |
        # The scale must be chosen so that:
        # 1) it is within bounds for the decimal type
        # 2) the floating-point exponent is within bounds
| min_scale = max(-max_precision, |
| precision + math.ceil(math.log10(2**float_exp_min))) |
| max_scale = min(max_precision, |
| math.floor(math.log10(2**float_exp_max))) |
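        # e.g. for float64 and decimal128: precision = 15 (the decimal
        # digits of a 53-bit mantissa), min_scale = max(-38, 15 - 307) = -38
        # and max_scale = min(38, 307) = 38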
| for scale in range(min_scale, max_scale): |
| decimal_ty = decimal_traits.factory(precision, scale) |
| # We want to random-generate a float from its mantissa bits |
| # and exponent, and compute the expected value in the |
| # decimal domain. The float exponent has to ensure the |
| # expected value doesn't overflow and doesn't lose precision. |
| float_exp = (-mantissa_bits + |
| math.floor(math.log2(10**(precision - scale)))) |
| assert float_exp_min <= float_exp <= float_exp_max |
| for i in range(5): |
| mantissa = r.randrange(0, 2**mantissa_bits) |
| float_val = np.ldexp(np_float_ty(mantissa), float_exp) |
| assert isinstance(float_val, np_float_ty) |
                # Compute the exact expected value, rounding half to even
                # when converting to the expected precision.
| if float_exp >= 0: |
| expected = decimal.Decimal(mantissa) * 2**float_exp |
| else: |
| expected = decimal.Decimal(mantissa) / 2**-float_exp |
| expected_as_int = round(expected.scaleb(scale)) |
| actual = pc.cast( |
| pa.scalar(float_val, type=float_ty), decimal_ty).as_py() |
| actual_as_int = round(actual.scaleb(scale)) |
| # We allow for a minor rounding error between expected and actual |
| assert abs(actual_as_int - expected_as_int) <= 1 |
| |
| |
| def test_strptime(): |
| arr = pa.array(["5/1/2020", None, "12/13/1900"]) |
| |
| got = pc.strptime(arr, format='%m/%d/%Y', unit='s') |
| expected = pa.array( |
| [datetime.datetime(2020, 5, 1), None, datetime.datetime(1900, 12, 13)], |
| type=pa.timestamp('s')) |
| assert got == expected |
| # Positional format |
| assert pc.strptime(arr, '%m/%d/%Y', unit='s') == got |
| |
| expected = pa.array([datetime.datetime(2020, 1, 5), None, None], |
| type=pa.timestamp('s')) |
| got = pc.strptime(arr, format='%d/%m/%Y', unit='s', error_is_null=True) |
| assert got == expected |
| |
| with pytest.raises(pa.ArrowInvalid, |
| match="Failed to parse string: '5/1/2020'"): |
| pc.strptime(arr, format='%Y-%m-%d', unit='s', error_is_null=False) |
| |
| with pytest.raises(pa.ArrowInvalid, |
| match="Failed to parse string: '5/1/2020'"): |
| pc.strptime(arr, format='%Y-%m-%d', unit='s') |
| |
| got = pc.strptime(arr, format='%Y-%m-%d', unit='s', error_is_null=True) |
| assert got == pa.array([None, None, None], type=pa.timestamp('s')) |
| |
| |
| @pytest.mark.pandas |
| @pytest.mark.timezone_data |
| def test_strftime(): |
| times = ["2018-03-10 09:00", "2038-01-31 12:23", None] |
| timezones = ["CET", "UTC", "Europe/Ljubljana"] |
| |
| formats = ["%a", "%A", "%w", "%d", "%b", "%B", "%m", "%y", "%Y", "%H", "%I", |
| "%p", "%M", "%z", "%Z", "%j", "%U", "%W", "%%", "%G", "%V", "%u"] |
| if sys.platform != "win32": |
| # Locale-dependent formats don't match on Windows |
| formats.extend(["%c", "%x", "%X"]) |
| |
| for timezone in timezones: |
| ts = pd.to_datetime(times).tz_localize(timezone) |
| for unit in ["s", "ms", "us", "ns"]: |
| tsa = pa.array(ts, type=pa.timestamp(unit, timezone)) |
| for fmt in formats: |
| options = pc.StrftimeOptions(fmt) |
| result = pc.strftime(tsa, options=options) |
| # cast to the same type as result to ignore string vs large_string |
| expected = pa.array(ts.strftime(fmt)).cast(result.type) |
| assert result.equals(expected) |
| |
| fmt = "%Y-%m-%dT%H:%M:%S" |
| |
| # Default format |
| tsa = pa.array(ts, type=pa.timestamp("s", timezone)) |
| result = pc.strftime(tsa, options=pc.StrftimeOptions()) |
| expected = pa.array(ts.strftime(fmt)).cast(result.type) |
| assert result.equals(expected) |
| |
| # Default format plus timezone |
| tsa = pa.array(ts, type=pa.timestamp("s", timezone)) |
| result = pc.strftime(tsa, options=pc.StrftimeOptions(fmt + "%Z")) |
| expected = pa.array(ts.strftime(fmt + "%Z")).cast(result.type) |
| assert result.equals(expected) |
| |
| # Pandas %S is equivalent to %S in arrow for unit="s" |
| tsa = pa.array(ts, type=pa.timestamp("s", timezone)) |
| options = pc.StrftimeOptions("%S") |
| result = pc.strftime(tsa, options=options) |
| expected = pa.array(ts.strftime("%S")).cast(result.type) |
| assert result.equals(expected) |
| |
| # Pandas %S.%f is equivalent to %S in arrow for unit="us" |
| tsa = pa.array(ts, type=pa.timestamp("us", timezone)) |
| options = pc.StrftimeOptions("%S") |
| result = pc.strftime(tsa, options=options) |
| expected = pa.array(ts.strftime("%S.%f")).cast(result.type) |
| assert result.equals(expected) |
| |
| # Test setting locale |
| tsa = pa.array(ts, type=pa.timestamp("s", timezone)) |
| options = pc.StrftimeOptions(fmt, locale="C") |
| result = pc.strftime(tsa, options=options) |
| expected = pa.array(ts.strftime(fmt)).cast(result.type) |
| assert result.equals(expected) |
| |
| # Test timestamps without timezone |
| fmt = "%Y-%m-%dT%H:%M:%S" |
| ts = pd.to_datetime(times) |
| tsa = pa.array(ts, type=pa.timestamp("s")) |
| result = pc.strftime(tsa, options=pc.StrftimeOptions(fmt)) |
| expected = pa.array(ts.strftime(fmt)).cast(result.type) |
| |
| # Positional format |
| assert pc.strftime(tsa, fmt) == result |
| |
| assert result.equals(expected) |
| with pytest.raises(pa.ArrowInvalid, |
| match="Timezone not present, cannot convert to string"): |
| pc.strftime(tsa, options=pc.StrftimeOptions(fmt + "%Z")) |
| with pytest.raises(pa.ArrowInvalid, |
| match="Timezone not present, cannot convert to string"): |
| pc.strftime(tsa, options=pc.StrftimeOptions(fmt + "%z")) |
| |
| |
| def _check_datetime_components(timestamps, timezone=None): |
| from pyarrow.vendored.version import Version |
| |
| ts = pd.to_datetime(timestamps).tz_localize( |
| "UTC").tz_convert(timezone).to_series() |
| tsa = pa.array(ts, pa.timestamp("ns", tz=timezone)) |
| |
| subseconds = ((ts.dt.microsecond * 10 ** 3 + |
| ts.dt.nanosecond) * 10 ** -9).round(9) |
| iso_calendar_fields = [ |
| pa.field('iso_year', pa.int64()), |
| pa.field('iso_week', pa.int64()), |
| pa.field('iso_day_of_week', pa.int64()) |
| ] |
| |
| if Version(pd.__version__) < Version("1.1.0"): |
| # https://github.com/pandas-dev/pandas/issues/33206 |
| iso_year = ts.map(lambda x: x.isocalendar()[0]).astype("int64") |
| iso_week = ts.map(lambda x: x.isocalendar()[1]).astype("int64") |
| iso_day = ts.map(lambda x: x.isocalendar()[2]).astype("int64") |
| else: |
| # Casting is required because pandas isocalendar returns int32 |
| # while arrow isocalendar returns int64. |
| iso_year = ts.dt.isocalendar()["year"].astype("int64") |
| iso_week = ts.dt.isocalendar()["week"].astype("int64") |
| iso_day = ts.dt.isocalendar()["day"].astype("int64") |
| |
| iso_calendar = pa.StructArray.from_arrays( |
| [iso_year, iso_week, iso_day], |
| fields=iso_calendar_fields) |
| |
    # Casting is required because with pandas 2.0.0 various numeric
    # date/time attributes have dtype int32 (previously int64)
| year = ts.dt.year.astype("int64") |
| month = ts.dt.month.astype("int64") |
| day = ts.dt.day.astype("int64") |
| dayofweek = ts.dt.dayofweek.astype("int64") |
| dayofyear = ts.dt.dayofyear.astype("int64") |
| quarter = ts.dt.quarter.astype("int64") |
| hour = ts.dt.hour.astype("int64") |
| minute = ts.dt.minute.astype("int64") |
| second = ts.dt.second.values.astype("int64") |
| microsecond = ts.dt.microsecond.astype("int64") |
| nanosecond = ts.dt.nanosecond.astype("int64") |
| |
| assert pc.year(tsa).equals(pa.array(year)) |
| assert pc.is_leap_year(tsa).equals(pa.array(ts.dt.is_leap_year)) |
| assert pc.month(tsa).equals(pa.array(month)) |
| assert pc.day(tsa).equals(pa.array(day)) |
| assert pc.day_of_week(tsa).equals(pa.array(dayofweek)) |
| assert pc.day_of_year(tsa).equals(pa.array(dayofyear)) |
| assert pc.iso_year(tsa).equals(pa.array(iso_year)) |
| assert pc.iso_week(tsa).equals(pa.array(iso_week)) |
| assert pc.iso_calendar(tsa).equals(iso_calendar) |
| assert pc.quarter(tsa).equals(pa.array(quarter)) |
| assert pc.hour(tsa).equals(pa.array(hour)) |
| assert pc.minute(tsa).equals(pa.array(minute)) |
| assert pc.second(tsa).equals(pa.array(second)) |
| assert pc.millisecond(tsa).equals(pa.array(microsecond // 10 ** 3)) |
| assert pc.microsecond(tsa).equals(pa.array(microsecond % 10 ** 3)) |
| assert pc.nanosecond(tsa).equals(pa.array(nanosecond)) |
| assert pc.subsecond(tsa).equals(pa.array(subseconds)) |
| assert pc.local_timestamp(tsa).equals(pa.array(ts.dt.tz_localize(None))) |
| |
| if ts.dt.tz: |
| if ts.dt.tz is datetime.timezone.utc: |
| # datetime with utc returns None for dst() |
| is_dst = [False] * len(ts) |
| else: |
| is_dst = ts.apply(lambda x: x.dst().seconds > 0) |
| assert pc.is_dst(tsa).equals(pa.array(is_dst)) |
| |
| day_of_week_options = pc.DayOfWeekOptions( |
| count_from_zero=False, week_start=1) |
| assert pc.day_of_week(tsa, options=day_of_week_options).equals( |
| pa.array(dayofweek + 1)) |
| |
| week_options = pc.WeekOptions( |
| week_starts_monday=True, count_from_zero=False, |
| first_week_is_fully_in_year=False) |
| assert pc.week(tsa, options=week_options).equals(pa.array(iso_week)) |
| |
| |
| @pytest.mark.pandas |
| def test_extract_datetime_components(request): |
| timestamps = ["1970-01-01T00:00:59.123456789", |
| "2000-02-29T23:23:23.999999999", |
| "2033-05-18T03:33:20.000000000", |
| "2020-01-01T01:05:05.001", |
| "2019-12-31T02:10:10.002", |
| "2019-12-30T03:15:15.003", |
| "2009-12-31T04:20:20.004132", |
| "2010-01-01T05:25:25.005321", |
| "2010-01-03T06:30:30.006163", |
| "2010-01-04T07:35:35.0", |
| "2006-01-01T08:40:40.0", |
| "2005-12-31T09:45:45.0", |
| "2008-12-28T00:00:00.0", |
| "2008-12-29T00:00:00.0", |
| "2012-01-01T01:02:03.0"] |
| timezones = ["UTC", "America/Chicago", "Asia/Kolkata", |
| "Etc/GMT-4", "Etc/GMT+4", "Australia/Broken_Hill"] |
| |
| # Test timezone naive timestamp array |
| _check_datetime_components(timestamps) |
| |
| # Test timezone aware timestamp array |
| if not request.config.pyarrow.is_enabled["timezone_data"]: |
| pytest.skip('Timezone database is not installed on Windows') |
| else: |
| for timezone in timezones: |
| _check_datetime_components(timestamps, timezone) |
| |
| |
| def test_offset_timezone(): |
| arr = pc.strptime(["2012-12-12T12:12:12"], format="%Y-%m-%dT%H:%M:%S", unit="s") |
| zoned_arr = arr.cast(pa.timestamp("s", tz="+05:30")) |
| assert pc.hour(zoned_arr)[0].as_py() == 17 |
| assert pc.minute(zoned_arr)[0].as_py() == 42 |
| |
| |
| @pytest.mark.parametrize("unit", ["s", "ms", "us", "ns"]) |
| def test_iso_calendar_longer_array(unit): |
| # https://github.com/apache/arrow/issues/38655 |
| # ensure correct result for array length > 32 |
| arr = pa.array([datetime.datetime(2022, 1, 2, 9)]*50, pa.timestamp(unit)) |
| result = pc.iso_calendar(arr) |
| expected = pa.StructArray.from_arrays( |
| [[2021]*50, [52]*50, [7]*50], |
| names=['iso_year', 'iso_week', 'iso_day_of_week'] |
| ) |
| assert result.equals(expected) |
| |
| |
| @pytest.mark.pandas |
| @pytest.mark.timezone_data |
| def test_assume_timezone(): |
| ts_type = pa.timestamp("ns") |
| timestamps = pd.to_datetime(["1970-01-01T00:00:59.123456789", |
| "2000-02-29T23:23:23.999999999", |
| "2033-05-18T03:33:20.000000000", |
| "2020-01-01T01:05:05.001", |
| "2019-12-31T02:10:10.002", |
| "2019-12-30T03:15:15.003", |
| "2009-12-31T04:20:20.004132", |
| "2010-01-01T05:25:25.005321", |
| "2010-01-03T06:30:30.006163", |
| "2010-01-04T07:35:35.0", |
| "2006-01-01T08:40:40.0", |
| "2005-12-31T09:45:45.0", |
| "2008-12-28T00:00:00.0", |
| "2008-12-29T00:00:00.0", |
| "2012-01-01T01:02:03.0"]) |
| nonexistent = pd.to_datetime(["2015-03-29 02:30:00", |
| "2015-03-29 03:30:00"]) |
| ambiguous = pd.to_datetime(["2018-10-28 01:20:00", |
| "2018-10-28 02:36:00", |
| "2018-10-28 03:46:00"]) |
| ambiguous_array = pa.array(ambiguous, type=ts_type) |
| nonexistent_array = pa.array(nonexistent, type=ts_type) |
| |
| for timezone in ["UTC", "America/Chicago", "Asia/Kolkata"]: |
| options = pc.AssumeTimezoneOptions(timezone) |
| ta = pa.array(timestamps, type=ts_type) |
| expected = timestamps.tz_localize(timezone) |
| result = pc.assume_timezone(ta, options=options) |
| assert result.equals(pa.array(expected)) |
| result = pc.assume_timezone(ta, timezone) # Positional option |
| assert result.equals(pa.array(expected)) |
| |
| ta_zoned = pa.array(timestamps, type=pa.timestamp("ns", timezone)) |
| with pytest.raises(pa.ArrowInvalid, match="already have a timezone:"): |
| pc.assume_timezone(ta_zoned, options=options) |
| |
| invalid_options = pc.AssumeTimezoneOptions("Europe/Brusselsss") |
| with pytest.raises(ValueError, match="not found in timezone database"): |
| pc.assume_timezone(ta, options=invalid_options) |
| |
| timezone = "Europe/Brussels" |
| |
| options_nonexistent_raise = pc.AssumeTimezoneOptions(timezone) |
| options_nonexistent_earliest = pc.AssumeTimezoneOptions( |
| timezone, ambiguous="raise", nonexistent="earliest") |
| options_nonexistent_latest = pc.AssumeTimezoneOptions( |
| timezone, ambiguous="raise", nonexistent="latest") |
| |
| with pytest.raises(ValueError, |
| match="Timestamp doesn't exist in " |
| f"timezone '{timezone}'"): |
| pc.assume_timezone(nonexistent_array, |
| options=options_nonexistent_raise) |
| |
| expected = pa.array(nonexistent.tz_localize( |
| timezone, nonexistent="shift_forward")) |
| result = pc.assume_timezone( |
| nonexistent_array, options=options_nonexistent_latest) |
    assert expected.equals(result)
| |
| expected = pa.array(nonexistent.tz_localize( |
| timezone, nonexistent="shift_backward")) |
| result = pc.assume_timezone( |
| nonexistent_array, options=options_nonexistent_earliest) |
    assert expected.equals(result)
| |
| options_ambiguous_raise = pc.AssumeTimezoneOptions(timezone) |
| options_ambiguous_latest = pc.AssumeTimezoneOptions( |
| timezone, ambiguous="latest", nonexistent="raise") |
| options_ambiguous_earliest = pc.AssumeTimezoneOptions( |
| timezone, ambiguous="earliest", nonexistent="raise") |
| |
| with pytest.raises(ValueError, |
| match="Timestamp is ambiguous in " |
| f"timezone '{timezone}'"): |
| pc.assume_timezone(ambiguous_array, options=options_ambiguous_raise) |
| |
| expected = ambiguous.tz_localize(timezone, ambiguous=[True, True, True]) |
| result = pc.assume_timezone( |
| ambiguous_array, options=options_ambiguous_earliest) |
    assert result.equals(pa.array(expected))
| |
| expected = ambiguous.tz_localize(timezone, ambiguous=[False, False, False]) |
| result = pc.assume_timezone( |
| ambiguous_array, options=options_ambiguous_latest) |
    assert result.equals(pa.array(expected))
| |
| |
| def _check_temporal_rounding(ts, values, unit): |
| unit_shorthand = { |
| "nanosecond": "ns", |
| "microsecond": "us", |
| "millisecond": "ms", |
| "second": "s", |
| "minute": "min", |
| "hour": "h", |
| "day": "D" |
| } |
| greater_unit = { |
| "nanosecond": "us", |
| "microsecond": "ms", |
| "millisecond": "s", |
| "second": "min", |
| "minute": "h", |
| "hour": "D", |
| } |
| ta = pa.array(ts) |
| |
| for value in values: |
| frequency = str(value) + unit_shorthand[unit] |
| options = pc.RoundTemporalOptions(value, unit) |
| |
| result = pc.ceil_temporal(ta, options=options).to_pandas() |
| expected = ts.dt.ceil(frequency) |
| np.testing.assert_array_equal(result, expected) |
| |
| result = pc.floor_temporal(ta, options=options).to_pandas() |
| expected = ts.dt.floor(frequency) |
| np.testing.assert_array_equal(result, expected) |
| |
| result = pc.round_temporal(ta, options=options).to_pandas() |
| expected = ts.dt.round(frequency) |
| np.testing.assert_array_equal(result, expected) |
| |
        # Check rounding with calendar_based_origin=True.
        # Note: pandas does not support rounding to month, so we cannot
        # approximate that behavior here; hence unit == "day" (whose
        # greater unit would be month) is excluded.
| if unit != "day": |
| options = pc.RoundTemporalOptions( |
| value, unit, calendar_based_origin=True) |
| origin = ts.dt.floor(greater_unit[unit]) |
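            # e.g. with unit="minute" and value=7 the origin is the start
            # of each hour, so 10:37 ceils to 10:42 (six 7-minute steps
            # past 10:00) rather than to a multiple of 7 minutes from the
            # epoch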
| |
| if ta.type.tz is None: |
| result = pc.ceil_temporal(ta, options=options).to_pandas() |
| expected = (ts - origin).dt.ceil(frequency) + origin |
| np.testing.assert_array_equal(result, expected) |
| |
| result = pc.floor_temporal(ta, options=options).to_pandas() |
| expected = (ts - origin).dt.floor(frequency) + origin |
| np.testing.assert_array_equal(result, expected) |
| |
| result = pc.round_temporal(ta, options=options).to_pandas() |
| expected = (ts - origin).dt.round(frequency) + origin |
| np.testing.assert_array_equal(result, expected) |
| |
| # Check RoundTemporalOptions partial defaults |
| if unit == "day": |
| result = pc.ceil_temporal(ta, multiple=value).to_pandas() |
| expected = ts.dt.ceil(frequency) |
| np.testing.assert_array_equal(result, expected) |
| |
| result = pc.floor_temporal(ta, multiple=value).to_pandas() |
| expected = ts.dt.floor(frequency) |
| np.testing.assert_array_equal(result, expected) |
| |
| result = pc.round_temporal(ta, multiple=value).to_pandas() |
| expected = ts.dt.round(frequency) |
| np.testing.assert_array_equal(result, expected) |
| |
        # We naively test ceil_is_strictly_greater by adding the time unit
        # multiple to the regularly ceiled timestamp whenever it equals the
        # original timestamp. This does not work for zoned timestamps since
        # this logic does not account for DST jumps.
| if ta.type.tz is None: |
| options = pc.RoundTemporalOptions( |
| value, unit, ceil_is_strictly_greater=True) |
| result = pc.ceil_temporal(ta, options=options) |
| expected = ts.dt.ceil(frequency) |
| |
| expected = np.where( |
| expected == ts, |
| expected + pd.Timedelta(value, unit_shorthand[unit]), |
| expected) |
| np.testing.assert_array_equal(result, expected) |
| |
| # Check RoundTemporalOptions defaults |
| if unit == "day": |
| frequency = "1D" |
| |
| result = pc.ceil_temporal(ta).to_pandas() |
| expected = ts.dt.ceil(frequency) |
| np.testing.assert_array_equal(result, expected) |
| |
| result = pc.floor_temporal(ta).to_pandas() |
| expected = ts.dt.floor(frequency) |
| np.testing.assert_array_equal(result, expected) |
| |
| result = pc.round_temporal(ta).to_pandas() |
| expected = ts.dt.round(frequency) |
| np.testing.assert_array_equal(result, expected) |
| |
| |
| @pytest.mark.timezone_data |
| @pytest.mark.parametrize('unit', ("nanosecond", "microsecond", "millisecond", |
| "second", "minute", "hour", "day")) |
| @pytest.mark.pandas |
| def test_round_temporal(unit): |
| values = (1, 2, 3, 4, 5, 6, 7, 10, 15, 24, 60, 250, 500, 750) |
| timestamps = [ |
| "1923-07-07 08:52:35.203790336", |
| "1931-03-17 10:45:00.641559040", |
| "1932-06-16 01:16:42.911994368", |
| "1941-05-27 11:46:43.822831872", |
| "1943-12-14 07:32:05.424766464", |
| "1954-04-12 04:31:50.699881472", |
| "1966-02-12 17:41:28.693282560", |
| "1967-02-26 05:56:46.922376960", |
| "1975-11-01 10:55:37.016146432", |
| "1982-01-21 18:43:44.517366784", |
| "1992-01-01 00:00:00.100000000", |
| "1999-12-04 05:55:34.794991104", |
| "2026-10-26 08:39:00.316686848"] |
| ts = pd.Series([pd.Timestamp(x, unit="ns") for x in timestamps]) |
| _check_temporal_rounding(ts, values, unit) |
| |
| timezones = ["Asia/Kolkata", "America/New_York", "Etc/GMT-4", "Etc/GMT+4", |
| "Europe/Brussels", "Pacific/Marquesas", "America/Chicago", "UTC"] |
| |
| for timezone in timezones: |
| ts_zoned = ts.dt.tz_localize("UTC").dt.tz_convert(timezone) |
| _check_temporal_rounding(ts_zoned, values, unit) |
| |
| |
| def test_count(): |
| arr = pa.array([1, 2, 3, None, None]) |
| assert pc.count(arr).as_py() == 3 |
| assert pc.count(arr, mode='only_valid').as_py() == 3 |
| assert pc.count(arr, mode='only_null').as_py() == 2 |
| assert pc.count(arr, mode='all').as_py() == 5 |
| assert pc.count(arr, 'all').as_py() == 5 |
| |
| with pytest.raises(ValueError, |
| match='"something else" is not a valid count mode'): |
| pc.count(arr, 'something else') |
| |
| |
| def test_index(): |
| arr = pa.array([0, 1, None, 3, 4], type=pa.int64()) |
| assert pc.index(arr, pa.scalar(0)).as_py() == 0 |
| assert pc.index(arr, pa.scalar(2, type=pa.int8())).as_py() == -1 |
| assert pc.index(arr, 4).as_py() == 4 |
| assert arr.index(3, start=2).as_py() == 3 |
| assert arr.index(None).as_py() == -1 |
| |
| arr = pa.chunked_array([[1, 2], [1, 3]], type=pa.int64()) |
| assert arr.index(1).as_py() == 0 |
| assert arr.index(1, start=2).as_py() == 2 |
| assert arr.index(1, start=1, end=2).as_py() == -1 |
| |
| |
| def check_partition_nth(data, indices, pivot, null_placement): |
| indices = indices.to_pylist() |
| assert len(indices) == len(data) |
| assert sorted(indices) == list(range(len(data))) |
| until_pivot = [data[indices[i]] for i in range(pivot)] |
| after_pivot = [data[indices[i]] for i in range(pivot, len(data))] |
| p = data[indices[pivot]] |
| if p is None: |
| if null_placement == "at_start": |
| assert all(v is None for v in until_pivot) |
| else: |
| assert all(v is None for v in after_pivot) |
| else: |
| if null_placement == "at_start": |
| assert all(v is None or v <= p for v in until_pivot) |
| assert all(v >= p for v in after_pivot) |
| else: |
| assert all(v <= p for v in until_pivot) |
| assert all(v is None or v >= p for v in after_pivot) |
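

# For orientation: pc.partition_nth_indices permutes indices so that the
# element at the pivot position is the one a full sort would place there,
# e.g. for data = [5, 1, 4, 2, 3] and pivot=2 the pivot element is 3, with
# {1, 2} somewhere before it and {4, 5} somewhere after it (in unspecified
# order).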
| |
| |
| def test_partition_nth(): |
| data = list(range(100, 140)) |
| random.shuffle(data) |
| pivot = 10 |
| indices = pc.partition_nth_indices(data, pivot=pivot) |
| check_partition_nth(data, indices, pivot, "at_end") |
| # Positional pivot argument |
| assert pc.partition_nth_indices(data, pivot) == indices |
| |
| with pytest.raises( |
| ValueError, |
| match="'partition_nth_indices' cannot be called without options"): |
| pc.partition_nth_indices(data) |
| |
| |
| def test_partition_nth_null_placement(): |
| data = list(range(10)) + [None] * 10 |
| random.shuffle(data) |
| |
| for pivot in (0, 7, 13, 19): |
| for null_placement in ("at_start", "at_end"): |
| indices = pc.partition_nth_indices(data, pivot=pivot, |
| null_placement=null_placement) |
| check_partition_nth(data, indices, pivot, null_placement) |
| |
| |
| def test_select_k_array(): |
| def validate_select_k(select_k_indices, arr, order, stable_sort=False): |
| sorted_indices = pc.sort_indices(arr, sort_keys=[("dummy", order)]) |
| head_k_indices = sorted_indices.slice(0, len(select_k_indices)) |
| if stable_sort: |
| assert select_k_indices == head_k_indices |
| else: |
| expected = pc.take(arr, head_k_indices) |
| actual = pc.take(arr, select_k_indices) |
| assert actual == expected |
| |
| arr = pa.array([1, 2, None, 0]) |
| for k in [0, 2, 4]: |
| for order in ["descending", "ascending"]: |
| result = pc.select_k_unstable( |
| arr, k=k, sort_keys=[("dummy", order)]) |
| validate_select_k(result, arr, order) |
| |
| result = pc.top_k_unstable(arr, k=k) |
| validate_select_k(result, arr, "descending") |
| |
| result = pc.bottom_k_unstable(arr, k=k) |
| validate_select_k(result, arr, "ascending") |
| |
| result = pc.select_k_unstable( |
| arr, options=pc.SelectKOptions( |
| k=2, sort_keys=[("dummy", "descending")]) |
| ) |
| validate_select_k(result, arr, "descending") |
| |
| result = pc.select_k_unstable( |
| arr, options=pc.SelectKOptions(k=2, sort_keys=[("dummy", "ascending")]) |
| ) |
| validate_select_k(result, arr, "ascending") |
| |
    # Positional options
| assert pc.select_k_unstable(arr, 2, |
| sort_keys=[("dummy", "ascending")]) == result |
| assert pc.select_k_unstable(arr, 2, [("dummy", "ascending")]) == result |
| |
| |
| def test_select_k_table(): |
| def validate_select_k(select_k_indices, tbl, sort_keys, stable_sort=False): |
| sorted_indices = pc.sort_indices(tbl, sort_keys=sort_keys) |
| head_k_indices = sorted_indices.slice(0, len(select_k_indices)) |
| if stable_sort: |
| assert select_k_indices == head_k_indices |
| else: |
| expected = pc.take(tbl, head_k_indices) |
| actual = pc.take(tbl, select_k_indices) |
| assert actual == expected |
| |
| table = pa.table({"a": [1, 2, 0], "b": [1, 0, 1]}) |
| for k in [0, 2, 4]: |
| result = pc.select_k_unstable( |
| table, k=k, sort_keys=[("a", "ascending")]) |
| validate_select_k(result, table, sort_keys=[("a", "ascending")]) |
| |
| result = pc.select_k_unstable( |
| table, k=k, sort_keys=[(pc.field("a"), "ascending"), ("b", "ascending")]) |
| validate_select_k( |
| result, table, sort_keys=[("a", "ascending"), ("b", "ascending")]) |
| |
| result = pc.top_k_unstable(table, k=k, sort_keys=["a"]) |
| validate_select_k(result, table, sort_keys=[("a", "descending")]) |
| |
| result = pc.bottom_k_unstable(table, k=k, sort_keys=["a", "b"]) |
| validate_select_k( |
| result, table, sort_keys=[("a", "ascending"), ("b", "ascending")]) |
| |
| with pytest.raises( |
| ValueError, |
| match="'select_k_unstable' cannot be called without options"): |
| pc.select_k_unstable(table) |
| |
| with pytest.raises(ValueError, |
| match="select_k_unstable requires a nonnegative `k`"): |
| pc.select_k_unstable(table, k=-1, sort_keys=[("a", "ascending")]) |
| |
| with pytest.raises(ValueError, |
| match="select_k_unstable requires a " |
| "non-empty `sort_keys`"): |
| pc.select_k_unstable(table, k=2, sort_keys=[]) |
| |
| with pytest.raises(ValueError, match="not a valid sort order"): |
| pc.select_k_unstable(table, k=k, sort_keys=[("a", "nonscending")]) |
| |
| with pytest.raises(ValueError, |
| match="Invalid sort key column: No match for.*unknown"): |
| pc.select_k_unstable(table, k=k, sort_keys=[("unknown", "ascending")]) |
| |
| |
| def test_array_sort_indices(): |
| arr = pa.array([1, 2, None, 0]) |
| result = pc.array_sort_indices(arr) |
| assert result.to_pylist() == [3, 0, 1, 2] |
| result = pc.array_sort_indices(arr, order="ascending") |
| assert result.to_pylist() == [3, 0, 1, 2] |
| result = pc.array_sort_indices(arr, order="descending") |
| assert result.to_pylist() == [1, 0, 3, 2] |
| result = pc.array_sort_indices(arr, order="descending", |
| null_placement="at_start") |
| assert result.to_pylist() == [2, 1, 0, 3] |
| result = pc.array_sort_indices(arr, "descending", |
| null_placement="at_start") |
| assert result.to_pylist() == [2, 1, 0, 3] |
| |
| with pytest.raises(ValueError, match="not a valid sort order"): |
| pc.array_sort_indices(arr, order="nonscending") |
| |
| |
| def test_sort_indices_array(): |
| arr = pa.array([1, 2, None, 0]) |
| result = pc.sort_indices(arr) |
| assert result.to_pylist() == [3, 0, 1, 2] |
| result = pc.sort_indices(arr, sort_keys=[("dummy", "ascending")]) |
| assert result.to_pylist() == [3, 0, 1, 2] |
| result = pc.sort_indices(arr, sort_keys=[("dummy", "descending")]) |
| assert result.to_pylist() == [1, 0, 3, 2] |
| result = pc.sort_indices(arr, sort_keys=[("dummy", "descending")], |
| null_placement="at_start") |
| assert result.to_pylist() == [2, 1, 0, 3] |
| # Positional `sort_keys` |
| result = pc.sort_indices(arr, [("dummy", "descending")], |
| null_placement="at_start") |
| assert result.to_pylist() == [2, 1, 0, 3] |
| # Using SortOptions |
| result = pc.sort_indices( |
| arr, options=pc.SortOptions(sort_keys=[("dummy", "descending")]) |
| ) |
| assert result.to_pylist() == [1, 0, 3, 2] |
| result = pc.sort_indices( |
| arr, options=pc.SortOptions(sort_keys=[("dummy", "descending")], |
| null_placement="at_start") |
| ) |
| assert result.to_pylist() == [2, 1, 0, 3] |
| |
| |
| def test_sort_indices_table(): |
| table = pa.table({"a": [1, 1, None, 0], "b": [1, 0, 0, 1]}) |
| |
| result = pc.sort_indices(table, sort_keys=[("a", "ascending")]) |
| assert result.to_pylist() == [3, 0, 1, 2] |
| result = pc.sort_indices(table, sort_keys=[(pc.field("a"), "ascending")], |
| null_placement="at_start") |
| assert result.to_pylist() == [2, 3, 0, 1] |
| |
| result = pc.sort_indices( |
| table, sort_keys=[("a", "descending"), ("b", "ascending")] |
| ) |
| assert result.to_pylist() == [1, 0, 3, 2] |
| result = pc.sort_indices( |
| table, sort_keys=[("a", "descending"), ("b", "ascending")], |
| null_placement="at_start" |
| ) |
| assert result.to_pylist() == [2, 1, 0, 3] |
| # Positional `sort_keys` |
| result = pc.sort_indices( |
| table, [("a", "descending"), ("b", "ascending")], |
| null_placement="at_start" |
| ) |
| assert result.to_pylist() == [2, 1, 0, 3] |
| |
| with pytest.raises(ValueError, match="Must specify one or more sort keys"): |
| pc.sort_indices(table) |
| |
| with pytest.raises(ValueError, |
| match="Invalid sort key column: No match for.*unknown"): |
| pc.sort_indices(table, sort_keys=[("unknown", "ascending")]) |
| |
| with pytest.raises(ValueError, match="not a valid sort order"): |
| pc.sort_indices(table, sort_keys=[("a", "nonscending")]) |
| |
| |
| def test_is_in(): |
| arr = pa.array([1, 2, None, 1, 2, 3]) |
| |
| result = pc.is_in(arr, value_set=pa.array([1, 3, None])) |
| assert result.to_pylist() == [True, False, True, True, False, True] |
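    # (a null in the value_set with skip_nulls=False, the default, makes
    # the null in `arr` count as a match, hence True at index 2 above)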
| |
| result = pc.is_in(arr, value_set=pa.array([1, 3, None]), skip_nulls=True) |
| assert result.to_pylist() == [True, False, False, True, False, True] |
| |
| result = pc.is_in(arr, value_set=pa.array([1, 3])) |
| assert result.to_pylist() == [True, False, False, True, False, True] |
| |
| result = pc.is_in(arr, value_set=pa.array([1, 3]), skip_nulls=True) |
| assert result.to_pylist() == [True, False, False, True, False, True] |
| |
| |
| def test_index_in(): |
| arr = pa.array([1, 2, None, 1, 2, 3]) |
| |
| result = pc.index_in(arr, value_set=pa.array([1, 3, None])) |
| assert result.to_pylist() == [0, None, 2, 0, None, 1] |
| |
| result = pc.index_in(arr, value_set=pa.array([1, 3, None]), |
| skip_nulls=True) |
| assert result.to_pylist() == [0, None, None, 0, None, 1] |
| |
| result = pc.index_in(arr, value_set=pa.array([1, 3])) |
| assert result.to_pylist() == [0, None, None, 0, None, 1] |
| |
| result = pc.index_in(arr, value_set=pa.array([1, 3]), skip_nulls=True) |
| assert result.to_pylist() == [0, None, None, 0, None, 1] |
| |
| # Positional value_set |
| result = pc.index_in(arr, pa.array([1, 3]), skip_nulls=True) |
| assert result.to_pylist() == [0, None, None, 0, None, 1] |
| |
| |
| def test_quantile(): |
| arr = pa.array([1, 2, 3, 4]) |
| |
| result = pc.quantile(arr) |
| assert result.to_pylist() == [2.5] |
| |
| result = pc.quantile(arr, interpolation='lower') |
| assert result.to_pylist() == [2] |
| result = pc.quantile(arr, interpolation='higher') |
| assert result.to_pylist() == [3] |
| result = pc.quantile(arr, interpolation='nearest') |
| assert result.to_pylist() == [3] |
| result = pc.quantile(arr, interpolation='midpoint') |
| assert result.to_pylist() == [2.5] |
| result = pc.quantile(arr, interpolation='linear') |
| assert result.to_pylist() == [2.5] |
| |
| arr = pa.array([1, 2]) |
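    # With two elements, q=0.25 falls between 1 and 2: 'lower'/'higher'
    # pick a neighbor, 'midpoint' averages them, 'nearest' picks the
    # closer one and 'linear' interpolates the exact fraction.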
| |
| result = pc.quantile(arr, q=[0.25, 0.5, 0.75]) |
| assert result.to_pylist() == [1.25, 1.5, 1.75] |
| |
| result = pc.quantile(arr, q=[0.25, 0.5, 0.75], interpolation='lower') |
| assert result.to_pylist() == [1, 1, 1] |
| result = pc.quantile(arr, q=[0.25, 0.5, 0.75], interpolation='higher') |
| assert result.to_pylist() == [2, 2, 2] |
| result = pc.quantile(arr, q=[0.25, 0.5, 0.75], interpolation='midpoint') |
| assert result.to_pylist() == [1.5, 1.5, 1.5] |
| result = pc.quantile(arr, q=[0.25, 0.5, 0.75], interpolation='nearest') |
| assert result.to_pylist() == [1, 1, 2] |
| result = pc.quantile(arr, q=[0.25, 0.5, 0.75], interpolation='linear') |
| assert result.to_pylist() == [1.25, 1.5, 1.75] |
| |
| # Positional `q` |
| result = pc.quantile(arr, [0.25, 0.5, 0.75], interpolation='linear') |
| assert result.to_pylist() == [1.25, 1.5, 1.75] |
| |
| with pytest.raises(ValueError, match="Quantile must be between 0 and 1"): |
| pc.quantile(arr, q=1.1) |
| with pytest.raises(ValueError, match="not a valid quantile interpolation"): |
| pc.quantile(arr, interpolation='zzz') |
| |
| |
| def test_tdigest(): |
| arr = pa.array([1, 2, 3, 4]) |
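| # tdigest computes approximate quantiles (default q=0.5); on this |
| # tiny input it matches the exact median. |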
| result = pc.tdigest(arr) |
| assert result.to_pylist() == [2.5] |
| |
| arr = pa.chunked_array([pa.array([1, 2]), pa.array([3, 4])]) |
| result = pc.tdigest(arr) |
| assert result.to_pylist() == [2.5] |
| |
| arr = pa.array([1, 2, 3, 4]) |
| result = pc.tdigest(arr, q=[0, 0.5, 1]) |
| assert result.to_pylist() == [1, 2.5, 4] |
| |
| arr = pa.chunked_array([pa.array([1, 2]), pa.array([3, 4])]) |
| result = pc.tdigest(arr, [0, 0.5, 1]) # positional `q` |
| assert result.to_pylist() == [1, 2.5, 4] |
| |
| |
| def test_fill_null_segfault(): |
| # Regression test for ARROW-12672 (this used to segfault) |
| arr = pa.array([None], pa.bool_()).fill_null(False) |
| result = arr.cast(pa.int8()) |
| assert result == pa.array([0], pa.int8()) |
| |
| |
| def test_min_max_element_wise(): |
| arr1 = pa.array([1, 2, 3]) |
| arr2 = pa.array([3, 1, 2]) |
| arr3 = pa.array([2, 3, None]) |
| |
| result = pc.max_element_wise(arr1, arr2) |
| assert result == pa.array([3, 2, 3]) |
| result = pc.min_element_wise(arr1, arr2) |
| assert result == pa.array([1, 1, 2]) |
| |
| result = pc.max_element_wise(arr1, arr2, arr3) |
| assert result == pa.array([3, 3, 3]) |
| result = pc.min_element_wise(arr1, arr2, arr3) |
| assert result == pa.array([1, 1, 2]) |
| |
| # with options specified explicitly |
| result = pc.max_element_wise(arr1, arr3, skip_nulls=True) |
| assert result == pa.array([2, 3, 3]) |
| result = pc.min_element_wise(arr1, arr3, skip_nulls=True) |
| assert result == pa.array([1, 2, 3]) |
| result = pc.max_element_wise( |
| arr1, arr3, options=pc.ElementWiseAggregateOptions()) |
| assert result == pa.array([2, 3, 3]) |
| result = pc.min_element_wise( |
| arr1, arr3, options=pc.ElementWiseAggregateOptions()) |
| assert result == pa.array([1, 2, 3]) |
| |
| # not skipping nulls |
| result = pc.max_element_wise(arr1, arr3, skip_nulls=False) |
| assert result == pa.array([2, 3, None]) |
| result = pc.min_element_wise(arr1, arr3, skip_nulls=False) |
| assert result == pa.array([1, 2, None]) |
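| # Illustrative extra check (not in the original suite): scalars can |
| # be mixed in and broadcast element-wise, as the cumulative_* tests |
| # below rely on. |
| result = pc.max_element_wise(arr1, 2) |
| assert result == pa.array([2, 2, 3]) |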
| |
| |
| @pytest.mark.numpy |
| @pytest.mark.parametrize('start', (1.25, 10.5, -10.5)) |
| @pytest.mark.parametrize('skip_nulls', (True, False)) |
| def test_cumulative_sum(start, skip_nulls): |
| # Exact tests (e.g., integral types) |
| start_int = int(start) |
| starts = [None, start_int, pa.scalar(start_int, type=pa.int8()), |
| pa.scalar(start_int, type=pa.int64())] |
| for strt in starts: |
| arrays = [ |
| pa.array([1, 2, 3]), |
| pa.array([0, None, 20, 30]), |
| pa.chunked_array([[0, None], [20, 30]]) |
| ] |
| expected_arrays = [ |
| pa.array([1, 3, 6]), |
| pa.array([0, None, 20, 50]) |
| if skip_nulls else pa.array([0, None, None, None]), |
| pa.chunked_array([[0, None, 20, 50]]) |
| if skip_nulls else pa.chunked_array([[0, None, None, None]]) |
| ] |
| for i, arr in enumerate(arrays): |
| result = pc.cumulative_sum(arr, start=strt, skip_nulls=skip_nulls) |
| # Add `start` offset to expected array before comparing |
| expected = pc.add(expected_arrays[i], strt if strt is not None |
| else 0) |
| assert result.equals(expected) |
| |
| starts = [None, start, pa.scalar(start, type=pa.float32()), |
| pa.scalar(start, type=pa.float64())] |
| for strt in starts: |
| arrays = [ |
| pa.array([1.125, 2.25, 3.03125]), |
| pa.array([1, np.nan, 2, -3, 4, 5]), |
| pa.array([1, np.nan, None, 3, None, 5]) |
| ] |
| expected_arrays = [ |
| np.array([1.125, 3.375, 6.40625]), |
| np.array([1, np.nan, np.nan, np.nan, np.nan, np.nan]), |
| np.array([1, np.nan, None, np.nan, None, np.nan]) |
| if skip_nulls else np.array([1, np.nan, None, None, None, None]) |
| ] |
| for i, arr in enumerate(arrays): |
| result = pc.cumulative_sum(arr, start=strt, skip_nulls=skip_nulls) |
| # Add `start` offset to expected array before comparing |
| expected = pc.add(expected_arrays[i], strt if strt is not None |
| else 0) |
| np.testing.assert_array_almost_equal(result.to_numpy( |
| zero_copy_only=False), expected.to_numpy(zero_copy_only=False)) |
| |
| for strt in ['a', pa.scalar('arrow'), 1.1]: |
| with pytest.raises(pa.ArrowInvalid): |
| pc.cumulative_sum([1, 2, 3], start=strt) |
| |
| |
| @pytest.mark.numpy |
| @pytest.mark.parametrize('start', (1.25, 10.5, -10.5)) |
| @pytest.mark.parametrize('skip_nulls', (True, False)) |
| def test_cumulative_prod(start, skip_nulls): |
| # Exact tests (e.g., integral types) |
| start_int = int(start) |
| starts = [None, start_int, pa.scalar(start_int, type=pa.int8()), |
| pa.scalar(start_int, type=pa.int64())] |
| for strt in starts: |
| arrays = [ |
| pa.array([1, 2, 3]), |
| pa.array([1, None, 20, 5]), |
| pa.chunked_array([[1, None], [20, 5]]) |
| ] |
| expected_arrays = [ |
| pa.array([1, 2, 6]), |
| pa.array([1, None, 20, 100]) |
| if skip_nulls else pa.array([1, None, None, None]), |
| pa.chunked_array([[1, None, 20, 100]]) |
| if skip_nulls else pa.chunked_array([[1, None, None, None]]) |
| ] |
| for i, arr in enumerate(arrays): |
| result = pc.cumulative_prod(arr, start=strt, skip_nulls=skip_nulls) |
| # Multiply expected array by the `start` factor before comparing |
| expected = pc.multiply(expected_arrays[i], strt if strt is not None |
| else 1) |
| assert result.equals(expected) |
| |
| starts = [None, start, pa.scalar(start, type=pa.float32()), |
| pa.scalar(start, type=pa.float64())] |
| for strt in starts: |
| arrays = [ |
| pa.array([1.5, 2.5, 3.5]), |
| pa.array([1, np.nan, 2, -3, 4, 5]), |
| pa.array([1, np.nan, None, 3, None, 5]) |
| ] |
| expected_arrays = [ |
| np.array([1.5, 3.75, 13.125]), |
| np.array([1, np.nan, np.nan, np.nan, np.nan, np.nan]), |
| np.array([1, np.nan, None, np.nan, None, np.nan]) |
| if skip_nulls else np.array([1, np.nan, None, None, None, None]) |
| ] |
| for i, arr in enumerate(arrays): |
| result = pc.cumulative_prod(arr, start=strt, skip_nulls=skip_nulls) |
| # Multiply expected array by the `start` factor before comparing |
| expected = pc.multiply(expected_arrays[i], strt if strt is not None |
| else 1) |
| np.testing.assert_array_almost_equal(result.to_numpy( |
| zero_copy_only=False), expected.to_numpy(zero_copy_only=False)) |
| |
| for strt in ['a', pa.scalar('arrow'), 1.1]: |
| with pytest.raises(pa.ArrowInvalid): |
| pc.cumulative_prod([1, 2, 3], start=strt) |
| |
| |
| @pytest.mark.numpy |
| @pytest.mark.parametrize('start', (0.5, 3.5, 6.5)) |
| @pytest.mark.parametrize('skip_nulls', (True, False)) |
| def test_cumulative_max(start, skip_nulls): |
| # Exact tests (e.g., integral types) |
| start_int = int(start) |
| starts = [None, start_int, pa.scalar(start_int, type=pa.int8()), |
| pa.scalar(start_int, type=pa.int64())] |
| for strt in starts: |
| arrays = [ |
| pa.array([2, 1, 3, 5, 4, 6]), |
| pa.array([2, 1, None, 5, 4, None]), |
| pa.chunked_array([[2, 1, None], [5, 4, None]]) |
| ] |
| expected_arrays = [ |
| pa.array([2, 2, 3, 5, 5, 6]), |
| pa.array([2, 2, None, 5, 5, None]) |
| if skip_nulls else pa.array([2, 2, None, None, None, None]), |
| pa.chunked_array([[2, 2, None, 5, 5, None]]) |
| if skip_nulls else |
| pa.chunked_array([[2, 2, None, None, None, None]]) |
| ] |
| for i, arr in enumerate(arrays): |
| result = pc.cumulative_max(arr, start=strt, skip_nulls=skip_nulls) |
| # Take the element-wise max of `start` and the expected array |
| expected = pc.max_element_wise( |
| expected_arrays[i], strt if strt is not None else int(-1e9), |
| skip_nulls=False) |
| assert result.equals(expected) |
| |
| starts = [None, start, pa.scalar(start, type=pa.float32()), |
| pa.scalar(start, type=pa.float64())] |
| for strt in starts: |
| arrays = [ |
| pa.array([2.5, 1.3, 3.7, 5.1, 4.9, 6.2]), |
| pa.array([2.5, 1.3, 3.7, np.nan, 4.9, 6.2]), |
| pa.array([2.5, 1.3, None, np.nan, 4.9, None]) |
| ] |
| expected_arrays = [ |
| np.array([2.5, 2.5, 3.7, 5.1, 5.1, 6.2]), |
| np.array([2.5, 2.5, 3.7, 3.7, 4.9, 6.2]), |
| np.array([2.5, 2.5, None, 2.5, 4.9, None]) |
| if skip_nulls else np.array([2.5, 2.5, None, None, None, None]) |
| ] |
| for i, arr in enumerate(arrays): |
| result = pc.cumulative_max(arr, start=strt, skip_nulls=skip_nulls) |
| # Take the element-wise max of `start` and the expected array |
| expected = pc.max_element_wise( |
| expected_arrays[i], strt if strt is not None else -1e9, |
| skip_nulls=False) |
| np.testing.assert_array_almost_equal(result.to_numpy( |
| zero_copy_only=False), expected.to_numpy(zero_copy_only=False)) |
| |
| for strt in ['a', pa.scalar('arrow'), 1.1]: |
| with pytest.raises(pa.ArrowInvalid): |
| pc.cumulative_max([1, 2, 3], start=strt) |
| |
| |
| @pytest.mark.numpy |
| @pytest.mark.parametrize('start', (0.5, 3.5, 6.5)) |
| @pytest.mark.parametrize('skip_nulls', (True, False)) |
| def test_cumulative_min(start, skip_nulls): |
| # Exact tests (e.g., integral types) |
| start_int = int(start) |
| starts = [None, start_int, pa.scalar(start_int, type=pa.int8()), |
| pa.scalar(start_int, type=pa.int64())] |
| for strt in starts: |
| arrays = [ |
| pa.array([5, 6, 4, 2, 3, 1]), |
| pa.array([5, 6, None, 2, 3, None]), |
| pa.chunked_array([[5, 6, None], [2, 3, None]]) |
| ] |
| expected_arrays = [ |
| pa.array([5, 5, 4, 2, 2, 1]), |
| pa.array([5, 5, None, 2, 2, None]) |
| if skip_nulls else pa.array([5, 5, None, None, None, None]), |
| pa.chunked_array([[5, 5, None, 2, 2, None]]) |
| if skip_nulls else |
| pa.chunked_array([[5, 5, None, None, None, None]]) |
| ] |
| for i, arr in enumerate(arrays): |
| result = pc.cumulative_min(arr, start=strt, skip_nulls=skip_nulls) |
| # Take the element-wise min of `start` and the expected array |
| expected = pc.min_element_wise( |
| expected_arrays[i], strt if strt is not None else int(1e9), |
| skip_nulls=False) |
| assert result.equals(expected) |
| |
| starts = [None, start, pa.scalar(start, type=pa.float32()), |
| pa.scalar(start, type=pa.float64())] |
| for strt in starts: |
| arrays = [ |
| pa.array([5.5, 6.3, 4.7, 2.1, 3.9, 1.2]), |
| pa.array([5.5, 6.3, 4.7, np.nan, 3.9, 1.2]), |
| pa.array([5.5, 6.3, None, np.nan, 3.9, None]) |
| ] |
| expected_arrays = [ |
| np.array([5.5, 5.5, 4.7, 2.1, 2.1, 1.2]), |
| np.array([5.5, 5.5, 4.7, 4.7, 3.9, 1.2]), |
| np.array([5.5, 5.5, None, 5.5, 3.9, None]) |
| if skip_nulls else np.array([5.5, 5.5, None, None, None, None]) |
| ] |
| for i, arr in enumerate(arrays): |
| result = pc.cumulative_min(arr, start=strt, skip_nulls=skip_nulls) |
| # Take the element-wise min of `start` and the expected array |
| expected = pc.min_element_wise( |
| expected_arrays[i], strt if strt is not None else 1e9, |
| skip_nulls=False) |
| np.testing.assert_array_almost_equal(result.to_numpy( |
| zero_copy_only=False), expected.to_numpy(zero_copy_only=False)) |
| |
| for strt in ['a', pa.scalar('arrow'), 1.1]: |
| with pytest.raises(pa.ArrowInvalid): |
| pc.cumulative_min([1, 2, 3], start=strt) |
| |
| |
| def test_make_struct(): |
| assert pc.make_struct(1, 'a').as_py() == {'0': 1, '1': 'a'} |
| |
| assert pc.make_struct(1, 'a', field_names=['i', 's']).as_py() == { |
| 'i': 1, 's': 'a'} |
| |
| assert pc.make_struct([1, 2, 3], |
| "a b c".split()) == pa.StructArray.from_arrays([ |
| [1, 2, 3], |
| "a b c".split()], names='0 1'.split()) |
| |
| with pytest.raises(ValueError, |
| match="Array arguments must all be the same length"): |
| pc.make_struct([1, 2, 3, 4], "a b c".split()) |
| |
| with pytest.raises(ValueError, match="0 arguments but 2 field names"): |
| pc.make_struct(field_names=['one', 'two']) |
| |
| |
| def test_map_lookup(): |
| ty = pa.map_(pa.utf8(), pa.int32()) |
| arr = pa.array([[('one', 1), ('two', 2)], [('none', 3)], |
| [], [('one', 5), ('one', 7)], None], type=ty) |
| result_first = pa.array([1, None, None, 5, None], type=pa.int32()) |
| result_last = pa.array([1, None, None, 7, None], type=pa.int32()) |
| result_all = pa.array([[1], None, None, [5, 7], None], |
| type=pa.list_(pa.int32())) |
| |
| assert pc.map_lookup(arr, 'one', 'first') == result_first |
| assert pc.map_lookup(arr, pa.scalar( |
| 'one', type=pa.utf8()), 'first') == result_first |
| assert pc.map_lookup(arr, pa.scalar( |
| 'one', type=pa.utf8()), 'last') == result_last |
| assert pc.map_lookup(arr, pa.scalar( |
| 'one', type=pa.utf8()), 'all') == result_all |
| |
| |
| def test_struct_fields_options(): |
| a = pa.array([4, 5, 6], type=pa.int64()) |
| b = pa.array(["bar", None, ""]) |
| c = pa.StructArray.from_arrays([a, b], ["a", "b"]) |
| arr = pa.StructArray.from_arrays([a, c], ["a", "c"]) |
| |
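| # struct_field accepts many FieldRef spellings: dotted paths, byte |
| # strings, sequences of names/indices, and pc.field references; the |
| # checks below all select the same children. |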
| assert pc.struct_field(arr, '.c.b') == b |
| assert pc.struct_field(arr, b'.c.b') == b |
| assert pc.struct_field(arr, ['c', 'b']) == b |
| assert pc.struct_field(arr, [1, 'b']) == b |
| assert pc.struct_field(arr, (b'c', 'b')) == b |
| assert pc.struct_field(arr, pc.field(('c', 'b'))) == b |
| |
| assert pc.struct_field(arr, '.a') == a |
| assert pc.struct_field(arr, ['a']) == a |
| assert pc.struct_field(arr, 'a') == a |
| assert pc.struct_field(arr, pc.field(('a',))) == a |
| |
| assert pc.struct_field(arr, indices=[1, 1]) == b |
| assert pc.struct_field(arr, (1, 1)) == b |
| assert pc.struct_field(arr, [0]) == a |
| assert pc.struct_field(arr, []) == arr |
| |
| with pytest.raises(pa.ArrowInvalid, match="No match for FieldRef"): |
| pc.struct_field(arr, 'foo') |
| |
| with pytest.raises(pa.ArrowInvalid, match="No match for FieldRef"): |
| pc.struct_field(arr, '.c.foo') |
| |
| # drill into a non-struct array and continue to ask for a field |
| with pytest.raises(pa.ArrowInvalid, match="No match for FieldRef"): |
| pc.struct_field(arr, '.a.foo') |
| |
| with pytest.raises(pa.ArrowInvalid, match="cannot be called without options"): |
| pc.struct_field(arr) |
| |
| |
| def test_case_when(): |
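| # case_when takes a struct of boolean conditions plus one value input |
| # per condition; each row takes the value of the first true condition, |
| # or null if none is true. |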
| assert pc.case_when(pc.make_struct([True, False, None], |
| [False, True, None]), |
| [1, 2, 3], |
| [11, 12, 13]) == pa.array([1, 12, None]) |
| |
| |
| def test_list_element(): |
| element_type = pa.struct([('a', pa.float64()), ('b', pa.int8())]) |
| list_type = pa.list_(element_type) |
| l1 = [{'a': .4, 'b': 2}, None, {'a': .2, 'b': 4}, None, {'a': 5.6, 'b': 6}] |
| l2 = [None, {'a': .52, 'b': 3}, {'a': .7, 'b': 4}, None, {'a': .6, 'b': 8}] |
| lists = pa.array([l1, l2], list_type) |
| |
| index = 1 |
| result = pa.compute.list_element(lists, index) |
| expected = pa.array([None, {'a': 0.52, 'b': 3}], element_type) |
| assert result.equals(expected) |
| |
| index = 4 |
| result = pa.compute.list_element(lists, index) |
| expected = pa.array([{'a': 5.6, 'b': 6}, {'a': .6, 'b': 8}], element_type) |
| assert result.equals(expected) |
| |
| |
| def test_count_distinct(): |
| samples = [datetime.datetime(year=y, month=1, day=1) for y in range(1992, 2092)] |
| arr = pa.array(samples, pa.timestamp("ns")) |
| assert pc.count_distinct(arr) == pa.scalar(len(samples), type=pa.int64()) |
| |
| |
| def test_count_distinct_options(): |
| arr = pa.array([1, 2, 3, None, None]) |
| assert pc.count_distinct(arr).as_py() == 3 |
| assert pc.count_distinct(arr, mode='only_valid').as_py() == 3 |
| assert pc.count_distinct(arr, mode='only_null').as_py() == 1 |
| assert pc.count_distinct(arr, mode='all').as_py() == 4 |
| assert pc.count_distinct(arr, 'all').as_py() == 4 |
| |
| |
| def test_utf8_normalize(): |
| arr = pa.array(["01²3"]) |
| assert pc.utf8_normalize(arr, form="NFC") == arr |
| assert pc.utf8_normalize(arr, form="NFKC") == pa.array(["0123"]) |
| assert pc.utf8_normalize(arr, "NFD") == arr |
| assert pc.utf8_normalize(arr, "NFKD") == pa.array(["0123"]) |
| with pytest.raises( |
| ValueError, |
| match='"NFZ" is not a valid Unicode normalization form'): |
| pc.utf8_normalize(arr, form="NFZ") |
| |
| |
| def test_random(): |
| # (note negative integer initializers are accepted) |
| for initializer in ['system', 42, -42, b"abcdef"]: |
| assert pc.random(0, initializer=initializer) == \ |
| pa.array([], type=pa.float64()) |
| |
| # System random initialization => all outputs are distinct |
| arrays = [tuple(pc.random(100).to_pylist()) for i in range(10)] |
| assert len(set(arrays)) == len(arrays) |
| |
| arrays = [tuple(pc.random(100, initializer=i % 7).to_pylist()) |
| for i in range(0, 100)] |
| assert len(set(arrays)) == 7 |
| |
| # Arbitrary hashable objects can be given as initializer |
| initializers = [object(), (4, 5, 6), "foo"] |
| initializers.extend(os.urandom(10) for i in range(10)) |
| arrays = [tuple(pc.random(100, initializer=i).to_pylist()) |
| for i in initializers] |
| assert len(set(arrays)) == len(arrays) |
| |
| with pytest.raises(TypeError, |
| match=r"initializer should be 'system', an integer, " |
| r"or a hashable object; got \[\]"): |
| pc.random(100, initializer=[]) |
| |
| |
| @pytest.mark.parametrize( |
| "tiebreaker,expected_values", |
| [("min", [3, 1, 4, 6, 4, 6, 1]), |
| ("max", [3, 2, 5, 7, 5, 7, 2]), |
| ("first", [3, 1, 4, 6, 5, 7, 2]), |
| ("dense", [2, 1, 3, 4, 3, 4, 1])] |
| ) |
| def test_rank_options_tiebreaker(tiebreaker, expected_values): |
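| # Tiebreakers: "min"/"max" assign the smallest/largest rank of the |
| # tied group, "first" ranks ties in order of appearance, and "dense" |
| # leaves no gaps after ties. |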
| arr = pa.array([1.2, 0.0, 5.3, None, 5.3, None, 0.0]) |
| rank_options = pc.RankOptions(sort_keys="ascending", |
| null_placement="at_end", |
| tiebreaker=tiebreaker) |
| result = pc.rank(arr, options=rank_options) |
| expected = pa.array(expected_values, type=pa.uint64()) |
| assert result.equals(expected) |
| |
| |
| def test_rank_options(): |
| arr = pa.array([1.2, 0.0, 5.3, None, 5.3, None, 0.0]) |
| expected = pa.array([3, 1, 4, 6, 5, 7, 2], type=pa.uint64()) |
| |
| # Ensure rank can be called without specifying options |
| result = pc.rank(arr) |
| assert result.equals(expected) |
| |
| # Ensure default RankOptions |
| result = pc.rank(arr, options=pc.RankOptions()) |
| assert result.equals(expected) |
| |
| # Ensure sort_keys tuple usage |
| result = pc.rank(arr, options=pc.RankOptions( |
| sort_keys=[("b", "ascending")]) |
| ) |
| assert result.equals(expected) |
| |
| result = pc.rank(arr, null_placement="at_start") |
| expected_at_start = pa.array([5, 3, 6, 1, 7, 2, 4], type=pa.uint64()) |
| assert result.equals(expected_at_start) |
| |
| result = pc.rank(arr, sort_keys="descending") |
| expected_descending = pa.array([3, 4, 1, 6, 2, 7, 5], type=pa.uint64()) |
| assert result.equals(expected_descending) |
| |
| with pytest.raises(ValueError, |
| match=r'"NonExisting" is not a valid tiebreaker'): |
| pc.RankOptions(sort_keys="descending", |
| null_placement="at_end", |
| tiebreaker="NonExisting") |
| |
| |
| def test_rank_quantile_options(): |
| arr = pa.array([None, 1, None, 2, None]) |
| expected = pa.array([0.7, 0.1, 0.7, 0.3, 0.7], type=pa.float64()) |
| |
| # Ensure rank_quantile can be called without specifying options |
| result = pc.rank_quantile(arr) |
| assert result.equals(expected) |
| |
| # Ensure default RankOptions |
| result = pc.rank_quantile(arr, options=pc.RankQuantileOptions()) |
| assert result.equals(expected) |
| |
| # Ensure sort_keys tuple usage |
| result = pc.rank_quantile(arr, options=pc.RankQuantileOptions( |
| sort_keys=[("b", "ascending")]) |
| ) |
| assert result.equals(expected) |
| |
| result = pc.rank_quantile(arr, null_placement="at_start") |
| expected_at_start = pa.array([0.3, 0.7, 0.3, 0.9, 0.3], type=pa.float64()) |
| assert result.equals(expected_at_start) |
| |
| result = pc.rank_quantile(arr, sort_keys="descending") |
| expected_descending = pa.array([0.7, 0.3, 0.7, 0.1, 0.7], type=pa.float64()) |
| assert result.equals(expected_descending) |
| |
| with pytest.raises(ValueError, match="not a valid sort order"): |
| pc.rank_quantile(arr, sort_keys="XXX") |
| |
| |
| def test_rank_normal_options(): |
| arr = pa.array([None, 1, None, 2, None]) |
| |
| expected = pytest.approx( |
| [0.5244005127080407, -1.2815515655446004, 0.5244005127080407, |
| -0.5244005127080409, 0.5244005127080407]) |
| result = pc.rank_normal(arr) |
| assert result.to_pylist() == expected |
| result = pc.rank_normal(arr, null_placement="at_end", sort_keys="ascending") |
| assert result.to_pylist() == expected |
| result = pc.rank_normal(arr, options=pc.RankQuantileOptions()) |
| assert result.to_pylist() == expected |
| |
| expected = pytest.approx( |
| [-0.5244005127080409, 1.2815515655446004, -0.5244005127080409, |
| 0.5244005127080407, -0.5244005127080409]) |
| result = pc.rank_normal(arr, null_placement="at_start", sort_keys="descending") |
| assert result.to_pylist() == expected |
| result = pc.rank_normal(arr, |
| options=pc.RankQuantileOptions(null_placement="at_start", |
| sort_keys="descending")) |
| assert result.to_pylist() == expected |
| |
| |
| def create_sample_expressions(): |
| # We need a schema for substrait conversion |
| schema = pa.schema([pa.field("i64", pa.int64()), pa.field( |
| "foo", pa.struct([pa.field("bar", pa.string())]))]) |
| |
| # Creates a bunch of sample expressions for testing |
| # serialization and deserialization. The expressions are categorized |
| # to reflect certain nuances in Substrait conversion. |
| a = pc.scalar(1) |
| b = pc.scalar(1.1) |
| c = pc.scalar(True) |
| d = pc.scalar("string") |
| e = pc.scalar(None) |
| f = pc.scalar({'a': 1}) |
| g = pc.scalar(pa.scalar(1)) |
| h = pc.scalar(np.int64(2)) |
| j = pc.scalar(False) |
| k = pc.scalar(0) |
| |
| # These expressions consist entirely of literals |
| literal_exprs = [a, b, c, d, e, g, h, j, k] |
| |
| # These expressions include at least one function call |
| exprs_with_call = [a == b, a != b, a > b, c & j, c | j, ~c, d.is_valid(), |
| a + b, a - b, a * b, a / b, pc.negate(a), |
| pc.add(a, b), pc.subtract(a, b), pc.divide(a, b), |
| pc.multiply(a, b), pc.power(a, a), pc.sqrt(a), |
| pc.exp(b), pc.cos(b), pc.sin(b), pc.tan(b), |
| pc.acos(b), pc.atan(b), pc.asin(b), pc.atan2(b, b), |
| pc.sinh(a), pc.cosh(a), pc.tanh(a), |
| pc.asinh(a), pc.acosh(b), pc.atanh(k), |
| pc.abs(b), pc.sign(a), pc.bit_wise_not(a), |
| pc.bit_wise_and(a, a), pc.bit_wise_or(a, a), |
| pc.bit_wise_xor(a, a), pc.is_nan(b), pc.is_finite(b), |
| pc.coalesce(a, b), |
| a.cast(pa.int32(), safe=False)] |
| |
| # These expressions test out various reference styles and may include function |
| # calls. Named references are used here. |
| exprs_with_ref = [pc.field('i64') > 5, pc.field('i64') == 5, |
| pc.field('i64') == 7, |
| pc.field(('foo', 'bar')) == 'value', |
| pc.field('foo', 'bar') == 'value'] |
| |
| # Similar to above but these use numeric references instead of string refs |
| exprs_with_numeric_refs = [pc.field(0) > 5, pc.field(0) == 5, |
| pc.field(0) == 7, |
| pc.field((1, 0)) == 'value', |
| pc.field(1, 0) == 'value'] |
| |
| # Expressions that behave uniquely when converting to/from substrait |
| special_cases = [ |
| f, # Struct literals lose their field names |
| a.isin([1, 2, 3]), # isin converts to an or list |
| pc.field('i64').is_null() # pyarrow always specifies a FunctionOptions |
| # for is_null which, being the default, is |
| # dropped on serialization |
| ] |
| |
| all_exprs = literal_exprs.copy() |
| all_exprs += exprs_with_call |
| all_exprs += exprs_with_ref |
| all_exprs += special_cases |
| |
| return { |
| "all": all_exprs, |
| "literals": literal_exprs, |
| "calls": exprs_with_call, |
| "refs": exprs_with_ref, |
| "numeric_refs": exprs_with_numeric_refs, |
| "special": special_cases, |
| "schema": schema |
| } |
| |
| # Tests the Arrow-specific serialization mechanism |
| |
| |
| @pytest.mark.numpy |
| def test_expression_serialization_arrow(pickle_module): |
| for expr in create_sample_expressions()["all"]: |
| assert isinstance(expr, pc.Expression) |
| restored = pickle_module.loads(pickle_module.dumps(expr)) |
| assert expr.equals(restored) |
| |
| |
| @pytest.mark.numpy |
| @pytest.mark.substrait |
| def test_expression_serialization_substrait(): |
| |
| exprs = create_sample_expressions() |
| schema = exprs["schema"] |
| |
| # Basic literals don't change on binding and so they will round |
| # trip without any change |
| for expr in exprs["literals"]: |
| serialized = expr.to_substrait(schema) |
| deserialized = pc.Expression.from_substrait(serialized) |
| assert expr.equals(deserialized) |
| |
| # Expressions are bound when they are serialized. Since bound |
| # expressions are not equal to their unbound variants, we cannot |
| # compare the round-tripped result with the original. |
| for expr in exprs["calls"]: |
| serialized = expr.to_substrait(schema) |
| deserialized = pc.Expression.from_substrait(serialized) |
| # We can't compare the expressions themselves because of the |
| # bound/unbound difference, but we can compare their string |
| # representations |
| assert str(deserialized) == str(expr) |
| serialized_again = deserialized.to_substrait(schema) |
| deserialized_again = pc.Expression.from_substrait(serialized_again) |
| assert deserialized.equals(deserialized_again) |
| |
| for expr, expr_norm in zip(exprs["refs"], exprs["numeric_refs"]): |
| serialized = expr.to_substrait(schema) |
| deserialized = pc.Expression.from_substrait(serialized) |
| assert str(deserialized) == str(expr_norm) |
| serialized_again = deserialized.to_substrait(schema) |
| deserialized_again = pc.Expression.from_substrait(serialized_again) |
| assert deserialized.equals(deserialized_again) |
| |
| # For the special cases we get various wrinkles in serialization but we |
| # should always get the same result from round-tripping twice |
| for expr in exprs["special"]: |
| serialized = expr.to_substrait(schema) |
| deserialized = pc.Expression.from_substrait(serialized) |
| serialized_again = deserialized.to_substrait(schema) |
| deserialized_again = pc.Expression.from_substrait(serialized_again) |
| assert deserialized.equals(deserialized_again) |
| |
| # Special case: we lose the field names of struct literals |
| f = exprs["special"][0] |
| serialized = f.to_substrait(schema) |
| deserialized = pc.Expression.from_substrait(serialized) |
| assert deserialized.equals(pc.scalar({'': 1})) |
| |
| # Special case: is_in converts to a == opt[0] || a == opt[1] || ... |
| a = pc.scalar(1) |
| expr = a.isin([1, 2, 3]) |
| target = (a == 1) | (a == 2) | (a == 3) |
| serialized = expr.to_substrait(schema) |
| deserialized = pc.Expression.from_substrait(serialized) |
| # Compare strings here to bypass the bound/unbound difference |
| assert str(target) == str(deserialized) |
| serialized_again = deserialized.to_substrait(schema) |
| deserialized_again = pc.Expression.from_substrait(serialized_again) |
| assert deserialized.equals(deserialized_again) |
| |
| |
| def test_expression_construction(): |
| zero = pc.scalar(0) |
| one = pc.scalar(1) |
| true = pc.scalar(True) |
| false = pc.scalar(False) |
| string = pc.scalar("string") |
| field = pc.field("field") |
| nested_mixed_types = pc.field(b"a", 1, "b") |
| nested_field = pc.field(("nested", "field")) |
| nested_field2 = pc.field("nested", "field") |
| |
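| # These expressions are merely constructed; the test checks that |
| # building them does not raise. |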
| zero | one == string |
| ~true == false |
| for typ in ("bool", pa.bool_()): |
| field.cast(typ) == true |
| |
| field.isin([1, 2]) |
| nested_mixed_types.isin(["foo", "bar"]) |
| nested_field.isin(["foo", "bar"]) |
| nested_field2.isin(["foo", "bar"]) |
| |
| with pytest.raises(TypeError): |
| field.isin(1) |
| |
| with pytest.raises(pa.ArrowInvalid): |
| field != object() |
| |
| |
| def test_expression_boolean_operators(): |
| # https://issues.apache.org/jira/browse/ARROW-11412 |
| true = pc.scalar(True) |
| false = pc.scalar(False) |
| |
| with pytest.raises(ValueError, match="cannot be evaluated to python True"): |
| true and false |
| |
| with pytest.raises(ValueError, match="cannot be evaluated to python True"): |
| true or false |
| |
| with pytest.raises(ValueError, match="cannot be evaluated to python True"): |
| bool(true) |
| |
| with pytest.raises(ValueError, match="cannot be evaluated to python True"): |
| not true |
| |
| |
| def test_expression_call_function(): |
| field = pc.field("field") |
| |
| # no options |
| assert str(pc.hour(field)) == "hour(field)" |
| |
| # default options |
| assert str(pc.round(field)) == "round(field)" |
| # specified options |
| assert str(pc.round(field, ndigits=1)) == \ |
| "round(field, {ndigits=1, round_mode=HALF_TO_EVEN})" |
| |
| # Will convert non-expression arguments if possible |
| assert str(pc.add(field, 1)) == "add(field, 1)" |
| assert str(pc.add(field, pa.scalar(1))) == "add(field, 1)" |
| |
| # Invalid pc.scalar input gives original error message |
| msg = "only other expressions allowed as arguments" |
| with pytest.raises(TypeError, match=msg): |
| pc.add(field, object) |
| |
| |
| def test_cast_table_raises(): |
| table = pa.table({'a': [1, 2]}) |
| |
| with pytest.raises(pa.lib.ArrowTypeError): |
| pc.cast(table, pa.int64()) |
| |
| |
| @pytest.mark.parametrize("start,stop,expected", ( |
| (0, None, [[1, 2, 3], [4, 5, None], [6, None, None], None]), |
| (0, 1, [[1], [4], [6], None]), |
| (0, 2, [[1, 2], [4, 5], [6, None], None]), |
| (1, 2, [[2], [5], [None], None]), |
| (2, 4, [[3, None], [None, None], [None, None], None]) |
| )) |
| @pytest.mark.parametrize("step", (1, 2)) |
| @pytest.mark.parametrize("value_type", (pa.string, pa.int16, pa.float64)) |
| @pytest.mark.parametrize("list_type", (pa.list_, pa.large_list, "fixed")) |
| def test_list_slice_output_fixed(start, stop, step, expected, value_type, |
| list_type): |
| if list_type == "fixed": |
| arr = pa.array([[1, 2, 3], [4, 5, None], [6, None, None], None], |
| pa.list_(pa.int8(), 3)).cast(pa.list_(value_type(), 3)) |
| else: |
| arr = pa.array([[1, 2, 3], [4, 5], [6], None], |
| pa.list_(pa.int8())).cast(list_type(value_type())) |
| |
| args = arr, start, stop, step, True # return_fixed_size_list=True |
| if stop is None and list_type != "fixed": |
| msg = ("Unable to produce FixedSizeListArray from " |
| "non-FixedSizeListArray without `stop` being set.") |
| with pytest.raises(pa.ArrowInvalid, match=msg): |
| pc.list_slice(*args) |
| else: |
| result = pc.list_slice(*args) |
| pylist = result.cast(pa.list_(pa.int8(), |
| result.type.list_size)).to_pylist() |
| assert pylist == [e[::step] if e else e for e in expected] |
| |
| |
| @pytest.mark.parametrize("start,stop", ( |
| (0, None,), |
| (0, 1,), |
| (0, 2,), |
| (1, 2,), |
| (2, 4,) |
| )) |
| @pytest.mark.parametrize("step", (1, 2)) |
| @pytest.mark.parametrize("value_type", (pa.string, pa.int16, pa.float64)) |
| @pytest.mark.parametrize("list_type", (pa.list_, pa.large_list, "fixed")) |
| def test_list_slice_output_variable(start, stop, step, value_type, list_type): |
| if list_type == "fixed": |
| data = [[1, 2, 3], [4, 5, None], [6, None, None], None] |
| arr = pa.array( |
| data, |
| pa.list_(pa.int8(), 3)).cast(pa.list_(value_type(), 3)) |
| else: |
| data = [[1, 2, 3], [4, 5], [6], None] |
| arr = pa.array(data, |
| pa.list_(pa.int8())).cast(list_type(value_type())) |
| |
| # The output keeps the same list flavor (ListArray vs LargeListArray) |
| if list_type == "fixed": |
| list_type = pa.list_ # fixed-size input yields a non-fixed ListArray |
| |
| result = pc.list_slice(arr, start, stop, step, |
| return_fixed_size_list=False) |
| assert result.type == list_type(value_type()) |
| |
| pylist = result.cast(pa.list_(pa.int8())).to_pylist() |
| |
| # Variable output slicing follows Python's slice semantics |
| expected = [d[start:stop:step] if d is not None else None for d in data] |
| assert pylist == expected |
| |
| |
| @pytest.mark.parametrize("return_fixed_size", (True, False, None)) |
| @pytest.mark.parametrize("type", ( |
| lambda: pa.list_(pa.field('col', pa.int8())), |
| lambda: pa.list_(pa.field('col', pa.int8()), 1), |
| lambda: pa.large_list(pa.field('col', pa.int8())))) |
| def test_list_slice_field_names_retained(return_fixed_size, type): |
| arr = pa.array([[1]], type()) |
| out = pc.list_slice(arr, 0, 1, return_fixed_size_list=return_fixed_size) |
| assert arr.type.field(0).name == out.type.field(0).name |
| |
| # Verify out type matches in type if return_fixed_size_list==None |
| if return_fixed_size is None: |
| assert arr.type == out.type |
| |
| |
| def test_list_slice_bad_parameters(): |
| arr = pa.array([[1]], pa.list_(pa.int8(), 1)) |
| msg = r"`start`(.*) should be greater than 0 and smaller than `stop`(.*)" |
| with pytest.raises(pa.ArrowInvalid, match=msg): |
| pc.list_slice(arr, -1, 1) # negative start? |
| with pytest.raises(pa.ArrowInvalid, match=msg): |
| pc.list_slice(arr, 2, 1) # start > stop? |
| |
| # TODO(ARROW-18281): start==stop -> empty lists |
| with pytest.raises(pa.ArrowInvalid, match=msg): |
| pc.list_slice(arr, 0, 0) # start == stop? |
| |
| # Step not >= 1 |
| msg = "`step` must be >= 1, got: " |
| with pytest.raises(pa.ArrowInvalid, match=msg + "0"): |
| pc.list_slice(arr, 0, 1, step=0) |
| with pytest.raises(pa.ArrowInvalid, match=msg + "-1"): |
| pc.list_slice(arr, 0, 1, step=-1) |
| |
| |
| def check_run_end_encode_decode(value_type, run_end_encode_opts=None): |
| values = [1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3] |
| arr = pa.array(values, type=value_type) |
| encoded = pc.run_end_encode(arr, options=run_end_encode_opts) |
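| # The encoded result is a RunEndEncodedArray: each run of equal |
| # values is stored once together with its (exclusive) run-end index. |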
| decoded = pc.run_end_decode(encoded) |
| assert decoded.type == arr.type |
| assert decoded.equals(arr) |
| |
| |
| @pytest.mark.parametrize( |
| "value_type", |
| ( |
| pa.int8(), |
| pa.int16(), |
| pa.int32(), |
| pa.int64(), |
| pa.float16(), |
| pa.float32(), |
| pa.float64(), |
| pa.decimal32(4, 0), |
| pa.decimal64(4, 0), |
| pa.decimal128(4, 0), |
| pa.decimal256(4, 0), |
| ), |
| ) |
| @pytest.mark.parametrize( |
| "option", |
| ( |
| None, |
| pc.RunEndEncodeOptions(pa.int16()), |
| pc.RunEndEncodeOptions("int32"), |
| pc.RunEndEncodeOptions(pa.int64()), |
| ), |
| ) |
| def test_run_end_encode(value_type, option): |
| check_run_end_encode_decode(value_type, option) |
| |
| |
| def test_pairwise_diff(): |
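| # pairwise_diff computes arr[i] - arr[i - period]; the result is null |
| # when either operand is null or out of bounds. |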
| arr = pa.array([1, 2, 3, None, 4, 5]) |
| expected = pa.array([None, 1, 1, None, None, 1]) |
| result = pa.compute.pairwise_diff(arr, period=1) |
| assert result.equals(expected) |
| |
| arr = pa.array([1, 2, 3, None, 4, 5]) |
| expected = pa.array([None, None, 2, None, 1, None]) |
| result = pa.compute.pairwise_diff(arr, period=2) |
| assert result.equals(expected) |
| |
| # negative period |
| arr = pa.array([1, 2, 3, None, 4, 5], type=pa.int8()) |
| expected = pa.array([-1, -1, None, None, -1, None], type=pa.int8()) |
| result = pa.compute.pairwise_diff(arr, period=-1) |
| assert result.equals(expected) |
| |
| # wrap-around on overflow |
| arr = pa.array([1, 2, 3, None, 4, 5], type=pa.uint8()) |
| expected = pa.array([255, 255, None, None, 255, None], type=pa.uint8()) |
| result = pa.compute.pairwise_diff(arr, period=-1) |
| assert result.equals(expected) |
| |
| # fail on overflow |
| arr = pa.array([1, 2, 3, None, 4, 5], type=pa.uint8()) |
| with pytest.raises(pa.ArrowInvalid, |
| match="overflow"): |
| pa.compute.pairwise_diff_checked(arr, period=-1) |
| |
| |
| def test_pivot_wider(): |
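| # pivot_wider turns parallel key/value arrays into a single struct |
| # scalar with one field per expected key name; keys not listed in |
| # key_names are ignored unless unexpected_key_behavior="raise". |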
| key_names = ["width", "height"] |
| |
| result = pc.pivot_wider(["height", "width", "depth"], [10, None, 11]) |
| assert result.as_py() == {} |
| |
| result = pc.pivot_wider(["height", "width", "depth"], [10, None, 11], |
| key_names) |
| assert result.as_py() == {"width": None, "height": 10} |
| # check key order |
| assert list(result.as_py()) == ["width", "height"] |
| |
| result = pc.pivot_wider(["height", "width", "depth"], [10, None, 11], |
| key_names=key_names) |
| assert result.as_py() == {"width": None, "height": 10} |
| |
| with pytest.raises(KeyError, match="Unexpected pivot key: depth"): |
| result = pc.pivot_wider(["height", "width", "depth"], [10, None, 11], |
| key_names=key_names, |
| unexpected_key_behavior="raise") |
| |
| with pytest.raises(ValueError, match="Encountered more than one non-null value"): |
| result = pc.pivot_wider(["height", "width", "height"], [10, None, 11], |
| key_names=key_names) |
| |
| |
| def test_winsorize(): |
| arr = pa.array([10, 4, 9, 8, 5, 3, 7, 2, 1, 6]) |
| |
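| # Winsorizing clamps values below the lower-limit quantile and above |
| # the upper-limit quantile to those boundary values (here 2 and 8). |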
| result = pc.winsorize(arr, 0.1, 0.8) |
| assert result.to_pylist() == [8, 4, 8, 8, 5, 3, 7, 2, 2, 6] |
| |
| result = pc.winsorize( |
| arr, options=pc.WinsorizeOptions(lower_limit=0.1, upper_limit=0.8)) |
| assert result.to_pylist() == [8, 4, 8, 8, 5, 3, 7, 2, 2, 6] |