| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| |
| from pyarrow._compute import ( # noqa |
| Function, |
| FunctionOptions, |
| FunctionRegistry, |
| HashAggregateFunction, |
| HashAggregateKernel, |
| Kernel, |
| ScalarAggregateFunction, |
| ScalarAggregateKernel, |
| ScalarFunction, |
| ScalarKernel, |
| VectorFunction, |
| VectorKernel, |
| # Option classes |
| ArraySortOptions, |
| CastOptions, |
| CountOptions, |
| DictionaryEncodeOptions, |
| FilterOptions, |
| MatchSubstringOptions, |
| MinMaxOptions, |
| ModeOptions, |
| SplitOptions, |
| SplitPatternOptions, |
| PartitionNthOptions, |
| ProjectOptions, |
| QuantileOptions, |
| ReplaceSubstringOptions, |
| SetLookupOptions, |
| SortOptions, |
| StrptimeOptions, |
| TakeOptions, |
| TDigestOptions, |
| TrimOptions, |
| VarianceOptions, |
| # Functions |
| function_registry, |
| call_function, |
| get_function, |
| list_functions, |
| ) |
| |
| from textwrap import dedent |
| import warnings |
| |
| import pyarrow as pa |
| |
| |
| def _get_arg_names(func): |
| arg_names = func._doc.arg_names |
| if not arg_names: |
| if func.arity == 1: |
| arg_names = ["arg"] |
| elif func.arity == 2: |
| arg_names = ["left", "right"] |
| else: |
| raise NotImplementedError( |
| f"unsupported arity: {func.arity} (function: {func.name})") |
| |
| return arg_names |
| |
| |
| def _decorate_compute_function(wrapper, exposed_name, func, option_class): |
| wrapper.__arrow_compute_function__ = dict(name=func.name, |
| arity=func.arity) |
| wrapper.__name__ = exposed_name |
| wrapper.__qualname__ = exposed_name |
| |
| doc_pieces = [] |
| |
| cpp_doc = func._doc |
| summary = cpp_doc.summary |
| if not summary: |
| arg_str = "arguments" if func.arity > 1 else "argument" |
| summary = ("Call compute function {!r} with the given {}" |
| .format(func.name, arg_str)) |
| |
| description = cpp_doc.description |
| arg_names = _get_arg_names(func) |
| |
| doc_pieces.append("""\ |
| {}. |
| |
| """.format(summary)) |
| |
| if description: |
| doc_pieces.append("{}\n\n".format(description)) |
| |
| doc_pieces.append("""\ |
| Parameters |
| ---------- |
| """) |
| |
| for arg_name in arg_names: |
| if func.kind in ('vector', 'scalar_aggregate'): |
| arg_type = 'Array-like' |
| else: |
| arg_type = 'Array-like or scalar-like' |
| doc_pieces.append("""\ |
| {} : {} |
| Argument to compute function |
| """.format(arg_name, arg_type)) |
| |
| doc_pieces.append("""\ |
| memory_pool : pyarrow.MemoryPool, optional |
| If not passed, will allocate memory from the default memory pool. |
| """) |
| if option_class is not None: |
| doc_pieces.append("""\ |
| options : pyarrow.compute.{0}, optional |
| Parameters altering compute function semantics |
| **kwargs : optional |
| Parameters for {0} constructor. Either `options` |
| or `**kwargs` can be passed, but not both at the same time. |
| """.format(option_class.__name__)) |
| |
| wrapper.__doc__ = "".join(dedent(s) for s in doc_pieces) |
| return wrapper |
| |
| |
| def _get_options_class(func): |
| class_name = func._doc.options_class |
| if not class_name: |
| return None |
| try: |
| return globals()[class_name] |
| except KeyError: |
| warnings.warn("Python binding for {} not exposed" |
| .format(class_name), RuntimeWarning) |
| return None |
| |
| |
| def _handle_options(name, option_class, options, kwargs): |
| if kwargs: |
| if options is None: |
| return option_class(**kwargs) |
| raise TypeError( |
| "Function {!r} called with both an 'options' argument " |
| "and additional named arguments" |
| .format(name)) |
| |
| if options is not None: |
| if isinstance(options, dict): |
| return option_class(**options) |
| elif isinstance(options, option_class): |
| return options |
| raise TypeError( |
| "Function {!r} expected a {} parameter, got {}" |
| .format(name, option_class, type(options))) |
| |
| return options |
| |
| |
| _wrapper_template = dedent("""\ |
| def make_wrapper(func, option_class): |
| def {func_name}({args_sig}{kwonly}, memory_pool=None): |
| return func.call([{args_sig}], None, memory_pool) |
| return {func_name} |
| """) |
| |
| _wrapper_options_template = dedent("""\ |
| def make_wrapper(func, option_class): |
| def {func_name}({args_sig}{kwonly}, options=None, memory_pool=None, |
| **kwargs): |
| options = _handle_options({func_name!r}, option_class, options, |
| kwargs) |
| return func.call([{args_sig}], options, memory_pool) |
| return {func_name} |
| """) |
| |
| |
| def _wrap_function(name, func): |
| option_class = _get_options_class(func) |
| arg_names = _get_arg_names(func) |
| args_sig = ', '.join(arg_names) |
| kwonly = '' if arg_names[-1].startswith('*') else ', *' |
| |
| # Generate templated wrapper, so that the signature matches |
| # the documented argument names. |
| ns = {} |
| if option_class is not None: |
| template = _wrapper_options_template |
| else: |
| template = _wrapper_template |
| exec(template.format(func_name=name, args_sig=args_sig, kwonly=kwonly), |
| globals(), ns) |
| wrapper = ns['make_wrapper'](func, option_class) |
| |
| return _decorate_compute_function(wrapper, name, func, option_class) |
| |
| |
| def _make_global_functions(): |
| """ |
| Make global functions wrapping each compute function. |
| |
| Note that some of the automatically-generated wrappers may be overriden |
| by custom versions below. |
| """ |
| g = globals() |
| reg = function_registry() |
| |
| # Avoid clashes with Python keywords |
| rewrites = {'and': 'and_', |
| 'or': 'or_'} |
| |
| for cpp_name in reg.list_functions(): |
| name = rewrites.get(cpp_name, cpp_name) |
| func = reg.get_function(cpp_name) |
| assert name not in g, name |
| g[cpp_name] = g[name] = _wrap_function(name, func) |
| |
| |
| _make_global_functions() |
| |
| |
| def cast(arr, target_type, safe=True): |
| """ |
| Cast array values to another data type. Can also be invoked as an array |
| instance method. |
| |
| Parameters |
| ---------- |
| arr : Array or ChunkedArray |
| target_type : DataType or type string alias |
| Type to cast to |
| safe : bool, default True |
| Check for overflows or other unsafe conversions |
| |
| Examples |
| -------- |
| >>> from datetime import datetime |
| >>> import pyarrow as pa |
| >>> arr = pa.array([datetime(2010, 1, 1), datetime(2015, 1, 1)]) |
| >>> arr.type |
| TimestampType(timestamp[us]) |
| |
| You can use ``pyarrow.DataType`` objects to specify the target type: |
| |
| >>> cast(arr, pa.timestamp('ms')) |
| <pyarrow.lib.TimestampArray object at 0x7fe93c0f6910> |
| [ |
| 2010-01-01 00:00:00.000, |
| 2015-01-01 00:00:00.000 |
| ] |
| |
| >>> cast(arr, pa.timestamp('ms')).type |
| TimestampType(timestamp[ms]) |
| |
| Alternatively, it is also supported to use the string aliases for these |
| types: |
| |
| >>> arr.cast('timestamp[ms]') |
| <pyarrow.lib.TimestampArray object at 0x10420eb88> |
| [ |
| 1262304000000, |
| 1420070400000 |
| ] |
| >>> arr.cast('timestamp[ms]').type |
| TimestampType(timestamp[ms]) |
| |
| Returns |
| ------- |
| casted : Array |
| """ |
| if target_type is None: |
| raise ValueError("Cast target type must not be None") |
| if safe: |
| options = CastOptions.safe(target_type) |
| else: |
| options = CastOptions.unsafe(target_type) |
| return call_function("cast", [arr], options) |
| |
| |
| def match_substring(array, pattern): |
| """ |
| Test if substring *pattern* is contained within a value of a string array. |
| |
| Parameters |
| ---------- |
| array : pyarrow.Array or pyarrow.ChunkedArray |
| pattern : str |
| pattern to search for exact matches |
| |
| Returns |
| ------- |
| result : pyarrow.Array or pyarrow.ChunkedArray |
| """ |
| return call_function("match_substring", [array], |
| MatchSubstringOptions(pattern)) |
| |
| |
| def match_substring_regex(array, pattern): |
| """ |
| Test if regex *pattern* matches at any position a value of a string array. |
| |
| Parameters |
| ---------- |
| array : pyarrow.Array or pyarrow.ChunkedArray |
| pattern : str |
| regex pattern to search |
| |
| Returns |
| ------- |
| result : pyarrow.Array or pyarrow.ChunkedArray |
| """ |
| return call_function("match_substring_regex", [array], |
| MatchSubstringOptions(pattern)) |
| |
| |
| def sum(array): |
| """ |
| Sum the values in a numerical (chunked) array. |
| |
| Parameters |
| ---------- |
| array : pyarrow.Array or pyarrow.ChunkedArray |
| |
| Returns |
| ------- |
| sum : pyarrow.Scalar |
| """ |
| return call_function('sum', [array]) |
| |
| |
| def mode(array, n=1): |
| """ |
| Return top-n most common values and number of times they occur in a passed |
| numerical (chunked) array, in descending order of occurance. If there are |
| more than one values with same count, smaller one is returned first. |
| |
| Parameters |
| ---------- |
| array : pyarrow.Array or pyarrow.ChunkedArray |
| |
| Returns |
| ------- |
| An array of <input type "Mode", int64_t "Count"> structs |
| |
| Examples |
| -------- |
| >>> import pyarrow as pa |
| >>> import pyarrow.compute as pc |
| >>> arr = pa.array([1, 1, 2, 2, 3, 2, 2, 2]) |
| >>> modes = pc.mode(arr, 2) |
| >>> modes[0] |
| <pyarrow.StructScalar: {'mode': 2, 'count': 5}> |
| >>> modes[1] |
| <pyarrow.StructScalar: {'mode': 1, 'count': 2}> |
| """ |
| options = ModeOptions(n=n) |
| return call_function("mode", [array], options) |
| |
| |
| def filter(data, mask, null_selection_behavior='drop'): |
| """ |
| Select values (or records) from array- or table-like data given boolean |
| filter, where true values are selected. |
| |
| Parameters |
| ---------- |
| data : Array, ChunkedArray, RecordBatch, or Table |
| mask : Array, ChunkedArray |
| Must be of boolean type |
| null_selection_behavior : str, default 'drop' |
| Configure the behavior on encountering a null slot in the mask. |
| Allowed values are 'drop' and 'emit_null'. |
| |
| - 'drop': nulls will be treated as equivalent to False. |
| - 'emit_null': nulls will result in a null in the output. |
| |
| Returns |
| ------- |
| result : depends on inputs |
| |
| Examples |
| -------- |
| >>> import pyarrow as pa |
| >>> arr = pa.array(["a", "b", "c", None, "e"]) |
| >>> mask = pa.array([True, False, None, False, True]) |
| >>> arr.filter(mask) |
| <pyarrow.lib.StringArray object at 0x7fa826df9200> |
| [ |
| "a", |
| "e" |
| ] |
| >>> arr.filter(mask, null_selection_behavior='emit_null') |
| <pyarrow.lib.StringArray object at 0x7fa826df9200> |
| [ |
| "a", |
| null, |
| "e" |
| ] |
| """ |
| options = FilterOptions(null_selection_behavior) |
| return call_function('filter', [data, mask], options) |
| |
| |
| def take(data, indices, *, boundscheck=True, memory_pool=None): |
| """ |
| Select values (or records) from array- or table-like data given integer |
| selection indices. |
| |
| The result will be of the same type(s) as the input, with elements taken |
| from the input array (or record batch / table fields) at the given |
| indices. If an index is null then the corresponding value in the output |
| will be null. |
| |
| Parameters |
| ---------- |
| data : Array, ChunkedArray, RecordBatch, or Table |
| indices : Array, ChunkedArray |
| Must be of integer type |
| boundscheck : boolean, default True |
| Whether to boundscheck the indices. If False and there is an out of |
| bounds index, will likely cause the process to crash. |
| |
| Returns |
| ------- |
| result : depends on inputs |
| |
| Examples |
| -------- |
| >>> import pyarrow as pa |
| >>> arr = pa.array(["a", "b", "c", None, "e", "f"]) |
| >>> indices = pa.array([0, None, 4, 3]) |
| >>> arr.take(indices) |
| <pyarrow.lib.StringArray object at 0x7ffa4fc7d368> |
| [ |
| "a", |
| null, |
| "e", |
| null |
| ] |
| """ |
| options = TakeOptions(boundscheck=boundscheck) |
| return call_function('take', [data, indices], options, memory_pool) |
| |
| |
| def fill_null(values, fill_value): |
| """ |
| Replace each null element in values with fill_value. The fill_value must be |
| the same type as values or able to be implicitly casted to the array's |
| type. |
| |
| Parameters |
| ---------- |
| data : Array, ChunkedArray |
| replace each null element with fill_value |
| fill_value: Scalar-like object |
| Either a pyarrow.Scalar or any python object coercible to a |
| Scalar. If not same type as data will attempt to cast. |
| |
| Returns |
| ------- |
| result : depends on inputs |
| |
| Examples |
| -------- |
| >>> import pyarrow as pa |
| >>> arr = pa.array([1, 2, None, 3], type=pa.int8()) |
| >>> fill_value = pa.scalar(5, type=pa.int8()) |
| >>> arr.fill_null(fill_value) |
| pyarrow.lib.Int8Array object at 0x7f95437f01a0> |
| [ |
| 1, |
| 2, |
| 5, |
| 3 |
| ] |
| """ |
| if not isinstance(fill_value, pa.Scalar): |
| fill_value = pa.scalar(fill_value, type=values.type) |
| elif values.type != fill_value.type: |
| fill_value = pa.scalar(fill_value.as_py(), type=values.type) |
| |
| return call_function("fill_null", [values, fill_value]) |