python/pyarrow/compute.py - arrow - Git at Google

 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.


 from pyarrow._compute import (  # noqa
     FilterOptions,
     Function,
     FunctionRegistry,
     function_registry,
     call_function,
     TakeOptions
 )

 import pyarrow as pa
 import pyarrow._compute as _pc


 def cast(arr, target_type, safe=True):
     """
     Convert the values of an array to a different data type.

     The same operation is also available as an array instance method.

     Parameters
     ----------
     arr : Array or ChunkedArray
         Values to convert.
     target_type : DataType or type string alias
         Type to convert the values to.
     safe : bool, default True
         When true, check for overflows or other unsafe conversions.

     Returns
     -------
     casted : Array

     Raises
     ------
     ValueError
         If ``target_type`` is None.

     Examples
     --------
     >>> from datetime import datetime
     >>> import pyarrow as pa
     >>> arr = pa.array([datetime(2010, 1, 1), datetime(2015, 1, 1)])
     >>> arr.type
     TimestampType(timestamp[us])

     The target type can be given as a ``pyarrow.DataType`` object:

     >>> cast(arr, pa.timestamp('ms'))
     <pyarrow.lib.TimestampArray object at 0x7fe93c0f6910>
     [
       2010-01-01 00:00:00.000,
       2015-01-01 00:00:00.000
     ]

     >>> cast(arr, pa.timestamp('ms')).type
     TimestampType(timestamp[ms])

     The string alias of the type is accepted as well:

     >>> arr.cast('timestamp[ms]')
     <pyarrow.lib.TimestampArray object at 0x10420eb88>
     [
       1262304000000,
       1420070400000
     ]
     >>> arr.cast('timestamp[ms]').type
     TimestampType(timestamp[ms])
     """
     if target_type is None:
         raise ValueError("Cast target type must not be None")
     # Select the CastOptions factory first, then build the options with it.
     build_options = _pc.CastOptions.safe if safe else _pc.CastOptions.unsafe
     return call_function("cast", [arr], build_options(target_type))


 def _decorate_compute_function(func, name, *, arity):
     func.__arrow_compute_function__ = dict(name=name, arity=arity)
     return func


 def _simple_unary_function(name):
     """
     Create a one-argument wrapper around the compute function *name*.

     The returned callable forwards its single argument to ``call_function``
     and is tagged through ``_decorate_compute_function`` with ``arity=1``.
     """
     def func(arg):
         # The inner name and parameter are part of the returned callable's
         # visible signature, so they are kept as-is.
         args = [arg]
         return call_function(name, args)

     wrapper = _decorate_compute_function(func, name, arity=1)
     return wrapper


 def _simple_binary_function(name):
     """
     Create a two-argument wrapper around the compute function *name*.

     The returned callable forwards ``left`` and ``right``, in that order, to
     ``call_function`` and is tagged through ``_decorate_compute_function``
     with ``arity=2``.
     """
     def func(left, right):
         # The inner name and parameters are part of the returned callable's
         # visible signature, so they are kept as-is.
         args = [left, right]
         return call_function(name, args)

     wrapper = _decorate_compute_function(func, name, arity=2)
     return wrapper


 # Length and case conversion.
 binary_length = _simple_unary_function('binary_length')
 ascii_lower = _simple_unary_function('ascii_lower')
 ascii_upper = _simple_unary_function('ascii_upper')
 utf8_lower = _simple_unary_function('utf8_lower')
 utf8_upper = _simple_unary_function('utf8_upper')

 # String predicates, ASCII variants.
 string_is_ascii = _simple_unary_function('string_is_ascii')
 ascii_is_alnum = _simple_unary_function('ascii_is_alnum')
 ascii_is_alpha = _simple_unary_function('ascii_is_alpha')
 ascii_is_decimal = _simple_unary_function('ascii_is_decimal')
 ascii_is_lower = _simple_unary_function('ascii_is_lower')
 ascii_is_printable = _simple_unary_function('ascii_is_printable')
 ascii_is_title = _simple_unary_function('ascii_is_title')
 ascii_is_upper = _simple_unary_function('ascii_is_upper')
 # Aliases: both names are bound to the same wrapper object as
 # ascii_is_decimal, so they must stay below its definition.
 ascii_is_digit = ascii_is_decimal
 ascii_is_numeric = ascii_is_decimal

 # String predicates, UTF-8 variants (each one has its own wrapper).
 utf8_is_alnum = _simple_unary_function('utf8_is_alnum')
 utf8_is_alpha = _simple_unary_function('utf8_is_alpha')
 utf8_is_decimal = _simple_unary_function('utf8_is_decimal')
 utf8_is_digit = _simple_unary_function('utf8_is_digit')
 utf8_is_lower = _simple_unary_function('utf8_is_lower')
 utf8_is_numeric = _simple_unary_function('utf8_is_numeric')
 utf8_is_printable = _simple_unary_function('utf8_is_printable')
 utf8_is_title = _simple_unary_function('utf8_is_title')
 utf8_is_upper = _simple_unary_function('utf8_is_upper')

 # Validity checks.
 is_null = _simple_unary_function('is_null')
 is_valid = _simple_unary_function('is_valid')

 # List helpers.
 list_flatten = _simple_unary_function('list_flatten')
 list_parent_indices = _simple_unary_function('list_parent_indices')
 list_value_length = _simple_unary_function('list_value_length')

 # Arithmetic.
 add = _simple_binary_function('add')
 subtract = _simple_binary_function('subtract')
 multiply = _simple_binary_function('multiply')

 # Comparisons.
 equal = _simple_binary_function('equal')
 not_equal = _simple_binary_function('not_equal')
 less = _simple_binary_function('less')
 less_equal = _simple_binary_function('less_equal')
 greater = _simple_binary_function('greater')
 greater_equal = _simple_binary_function('greater_equal')


 def match_substring(array, pattern):
     """
     Check each value of a string array for the substring *pattern*.

     Parameters
     ----------
     array : pyarrow.Array or pyarrow.ChunkedArray
         Values to search.
     pattern : str
         Substring to look for; matches are exact.

     Returns
     -------
     result : pyarrow.Array or pyarrow.ChunkedArray
     """
     options = _pc.MatchSubstringOptions(pattern)
     return call_function("match_substring", [array], options)


 def sum(array):
     """
     Compute the sum of the values of a numerical (chunked) array.

     Parameters
     ----------
     array : pyarrow.Array or pyarrow.ChunkedArray
         Values to add up.

     Returns
     -------
     sum : pyarrow.Scalar
     """
     args = [array]
     return call_function('sum', args)


 def filter(data, mask, null_selection_behavior='drop'):
     """
     Keep the values (or records) of array- or table-like data for which a
     boolean mask is true.

     Parameters
     ----------
     data : Array, ChunkedArray, RecordBatch, or Table
         Input to select from.
     mask : Array, ChunkedArray
         Selection mask; must be of boolean type.
     null_selection_behavior : str, default 'drop'
         What to do when the mask holds a null slot. Allowed values are
         'drop' and 'emit_null'.

         - 'drop': a null is treated as equivalent to False.
         - 'emit_null': a null produces a null in the output.

     Returns
     -------
     result : depends on inputs

     Examples
     --------
     >>> import pyarrow as pa
     >>> arr = pa.array(["a", "b", "c", None, "e"])
     >>> mask = pa.array([True, False, None, False, True])
     >>> arr.filter(mask)
     <pyarrow.lib.StringArray object at 0x7fa826df9200>
     [
       "a",
       "e"
     ]
     >>> arr.filter(mask, null_selection_behavior='emit_null')
     <pyarrow.lib.StringArray object at 0x7fa826df9200>
     [
       "a",
       null,
       "e"
     ]
     """
     return call_function(
         'filter', [data, mask], FilterOptions(null_selection_behavior))


 def take(data, indices, boundscheck=True):
     """
     Pick values (or records) out of array- or table-like data by integer
     position.

     The output has the same type(s) as the input; its elements come from
     the input array (or record batch / table fields) at the requested
     indices. A null index yields a null in the corresponding output slot.

     Parameters
     ----------
     data : Array, ChunkedArray, RecordBatch, or Table
         Input to select from.
     indices : Array, ChunkedArray
         Positions to take; must be of integer type.
     boundscheck : boolean, default True
         Whether to boundscheck the indices. If False and an index is out
         of bounds, the process will likely crash.

     Returns
     -------
     result : depends on inputs

     Examples
     --------
     >>> import pyarrow as pa
     >>> arr = pa.array(["a", "b", "c", None, "e", "f"])
     >>> indices = pa.array([0, None, 4, 3])
     >>> arr.take(indices)
     <pyarrow.lib.StringArray object at 0x7ffa4fc7d368>
     [
       "a",
       null,
       "e",
       null
     ]
     """
     return call_function(
         'take', [data, indices], TakeOptions(boundscheck))


 def fill_null(values, fill_value):
     """
     Substitute ``fill_value`` for every null element of ``values``.

     ``fill_value`` must have the same type as ``values`` or be implicitly
     castable to that type.

     Parameters
     ----------
     values : Array, ChunkedArray
         Input whose null elements are replaced with ``fill_value``.
     fill_value : Scalar-like object
         Either a pyarrow.Scalar or any python object coercible to a
         Scalar. A non-Scalar is converted with ``pa.scalar`` using the
         type of ``values``; a Scalar of a different type is rebuilt from
         its ``as_py()`` value with the type of ``values``.

     Returns
     -------
     result : depends on inputs

     Examples
     --------
     >>> import pyarrow as pa
     >>> arr = pa.array([1, 2, None, 3], type=pa.int8())
     >>> fill_value = pa.scalar(5, type=pa.int8())
     >>> arr.fill_null(fill_value)
     <pyarrow.lib.Int8Array object at 0x7f95437f01a0>
     [
       1,
       2,
       5,
       3
     ]
     """
     if isinstance(fill_value, pa.Scalar):
         # Already a Scalar: only rebuild it when its type differs.
         if values.type != fill_value.type:
             fill_value = pa.scalar(fill_value.as_py(), type=values.type)
     else:
         # Plain python object: wrap it using the type of the input.
         fill_value = pa.scalar(fill_value, type=values.type)

     return call_function("fill_null", [values, fill_value])
	# Licensed to the Apache Software Foundation (ASF) under one
	# or more contributor license agreements. See the NOTICE file
	# distributed with this work for additional information
	# regarding copyright ownership. The ASF licenses this file
	# to you under the Apache License, Version 2.0 (the
	# "License"); you may not use this file except in compliance
	# with the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing,
	# software distributed under the License is distributed on an
	# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	# KIND, either express or implied. See the License for the
	# specific language governing permissions and limitations
	# under the License.


	from pyarrow._compute import ( # noqa
	FilterOptions,
	Function,
	FunctionRegistry,
	function_registry,
	call_function,
	TakeOptions
	)

	import pyarrow as pa
	import pyarrow._compute as _pc


	def cast(arr, target_type, safe=True):
	    """
	    Cast array values to another data type. Can also be invoked as an array
	    instance method.

	    Parameters
	    ----------
	    arr : Array or ChunkedArray
	    target_type : DataType or type string alias
	        Type to cast to
	    safe : bool, default True
	        Check for overflows or other unsafe conversions

	    Returns
	    -------
	    casted : Array

	    Raises
	    ------
	    ValueError
	        If ``target_type`` is None.

	    Examples
	    --------
	    >>> from datetime import datetime
	    >>> import pyarrow as pa
	    >>> arr = pa.array([datetime(2010, 1, 1), datetime(2015, 1, 1)])
	    >>> arr.type
	    TimestampType(timestamp[us])

	    You can use ``pyarrow.DataType`` objects to specify the target type:

	    >>> cast(arr, pa.timestamp('ms'))
	    <pyarrow.lib.TimestampArray object at 0x7fe93c0f6910>
	    [
	      2010-01-01 00:00:00.000,
	      2015-01-01 00:00:00.000
	    ]

	    >>> cast(arr, pa.timestamp('ms')).type
	    TimestampType(timestamp[ms])

	    Alternatively, it is also supported to use the string aliases for these
	    types:

	    >>> arr.cast('timestamp[ms]')
	    <pyarrow.lib.TimestampArray object at 0x10420eb88>
	    [
	      1262304000000,
	      1420070400000
	    ]
	    >>> arr.cast('timestamp[ms]').type
	    TimestampType(timestamp[ms])
	    """
	    # Reject a missing target type before building any options.
	    if target_type is None:
	        raise ValueError("Cast target type must not be None")
	    if safe:
	        options = _pc.CastOptions.safe(target_type)
	    else:
	        options = _pc.CastOptions.unsafe(target_type)
	    return call_function("cast", [arr], options)


	def _decorate_compute_function(func, name, *, arity):
	func.__arrow_compute_function__ = dict(name=name, arity=arity)
	return func


	def _simple_unary_function(name):
	    """
	    Return a one-argument callable that invokes compute function *name*.

	    The callable passes its argument to ``call_function`` and is decorated
	    by ``_decorate_compute_function`` with ``arity=1``.
	    """
	    def func(arg):
	        return call_function(name, [arg])
	    return _decorate_compute_function(func, name, arity=1)


	def _simple_binary_function(name):
	    """
	    Return a two-argument callable that invokes compute function *name*.

	    The callable passes ``left`` and ``right`` to ``call_function`` and is
	    decorated by ``_decorate_compute_function`` with ``arity=2``.
	    """
	    def func(left, right):
	        return call_function(name, [left, right])
	    return _decorate_compute_function(func, name, arity=2)


	# Length and case conversion.
	binary_length = _simple_unary_function('binary_length')
	ascii_lower = _simple_unary_function('ascii_lower')
	ascii_upper = _simple_unary_function('ascii_upper')
	utf8_lower = _simple_unary_function('utf8_lower')
	utf8_upper = _simple_unary_function('utf8_upper')

	# String predicates, ASCII variants.
	string_is_ascii = _simple_unary_function('string_is_ascii')
	ascii_is_alnum = _simple_unary_function('ascii_is_alnum')
	ascii_is_alpha = _simple_unary_function('ascii_is_alpha')
	ascii_is_decimal = _simple_unary_function('ascii_is_decimal')
	ascii_is_lower = _simple_unary_function('ascii_is_lower')
	ascii_is_printable = _simple_unary_function('ascii_is_printable')
	ascii_is_title = _simple_unary_function('ascii_is_title')
	ascii_is_upper = _simple_unary_function('ascii_is_upper')
	# Aliases: both names are bound to the same wrapper object as
	# ascii_is_decimal, so they must stay below its definition.
	ascii_is_digit = ascii_is_decimal
	ascii_is_numeric = ascii_is_decimal

	# String predicates, UTF-8 variants (each one has its own wrapper).
	utf8_is_alnum = _simple_unary_function('utf8_is_alnum')
	utf8_is_alpha = _simple_unary_function('utf8_is_alpha')
	utf8_is_decimal = _simple_unary_function('utf8_is_decimal')
	utf8_is_digit = _simple_unary_function('utf8_is_digit')
	utf8_is_lower = _simple_unary_function('utf8_is_lower')
	utf8_is_numeric = _simple_unary_function('utf8_is_numeric')
	utf8_is_printable = _simple_unary_function('utf8_is_printable')
	utf8_is_title = _simple_unary_function('utf8_is_title')
	utf8_is_upper = _simple_unary_function('utf8_is_upper')

	# Validity checks.
	is_null = _simple_unary_function('is_null')
	is_valid = _simple_unary_function('is_valid')

	# List helpers.
	list_flatten = _simple_unary_function('list_flatten')
	list_parent_indices = _simple_unary_function('list_parent_indices')
	list_value_length = _simple_unary_function('list_value_length')

	# Arithmetic.
	add = _simple_binary_function('add')
	subtract = _simple_binary_function('subtract')
	multiply = _simple_binary_function('multiply')

	# Comparisons.
	equal = _simple_binary_function('equal')
	not_equal = _simple_binary_function('not_equal')
	less = _simple_binary_function('less')
	less_equal = _simple_binary_function('less_equal')
	greater = _simple_binary_function('greater')
	greater_equal = _simple_binary_function('greater_equal')


	def match_substring(array, pattern):
	    """
	    Test if substring *pattern* is contained within a value of a string array.

	    Parameters
	    ----------
	    array : pyarrow.Array or pyarrow.ChunkedArray
	    pattern : str
	        pattern to search for exact matches

	    Returns
	    -------
	    result : pyarrow.Array or pyarrow.ChunkedArray
	    """
	    return call_function("match_substring", [array],
	                         _pc.MatchSubstringOptions(pattern))


	def sum(array):
	    """
	    Sum the values in a numerical (chunked) array.

	    Parameters
	    ----------
	    array : pyarrow.Array or pyarrow.ChunkedArray

	    Returns
	    -------
	    sum : pyarrow.Scalar
	    """
	    return call_function('sum', [array])


	def filter(data, mask, null_selection_behavior='drop'):
	    """
	    Select values (or records) from array- or table-like data given boolean
	    filter, where true values are selected.

	    Parameters
	    ----------
	    data : Array, ChunkedArray, RecordBatch, or Table
	    mask : Array, ChunkedArray
	        Must be of boolean type
	    null_selection_behavior : str, default 'drop'
	        Configure the behavior on encountering a null slot in the mask.
	        Allowed values are 'drop' and 'emit_null'.

	        - 'drop': nulls will be treated as equivalent to False.
	        - 'emit_null': nulls will result in a null in the output.

	    Returns
	    -------
	    result : depends on inputs

	    Examples
	    --------
	    >>> import pyarrow as pa
	    >>> arr = pa.array(["a", "b", "c", None, "e"])
	    >>> mask = pa.array([True, False, None, False, True])
	    >>> arr.filter(mask)
	    <pyarrow.lib.StringArray object at 0x7fa826df9200>
	    [
	      "a",
	      "e"
	    ]
	    >>> arr.filter(mask, null_selection_behavior='emit_null')
	    <pyarrow.lib.StringArray object at 0x7fa826df9200>
	    [
	      "a",
	      null,
	      "e"
	    ]
	    """
	    options = FilterOptions(null_selection_behavior)
	    return call_function('filter', [data, mask], options)


	def take(data, indices, boundscheck=True):
	    """
	    Select values (or records) from array- or table-like data given integer
	    selection indices.

	    The result will be of the same type(s) as the input, with elements taken
	    from the input array (or record batch / table fields) at the given
	    indices. If an index is null then the corresponding value in the output
	    will be null.

	    Parameters
	    ----------
	    data : Array, ChunkedArray, RecordBatch, or Table
	    indices : Array, ChunkedArray
	        Must be of integer type
	    boundscheck : boolean, default True
	        Whether to boundscheck the indices. If False and there is an out of
	        bounds index, will likely cause the process to crash.

	    Returns
	    -------
	    result : depends on inputs

	    Examples
	    --------
	    >>> import pyarrow as pa
	    >>> arr = pa.array(["a", "b", "c", None, "e", "f"])
	    >>> indices = pa.array([0, None, 4, 3])
	    >>> arr.take(indices)
	    <pyarrow.lib.StringArray object at 0x7ffa4fc7d368>
	    [
	      "a",
	      null,
	      "e",
	      null
	    ]
	    """
	    options = TakeOptions(boundscheck)
	    return call_function('take', [data, indices], options)


	def fill_null(values, fill_value):
	    """
	    Replace each null element in values with fill_value. The fill_value must be
	    the same type as values or able to be implicitly casted to the array's
	    type.

	    Parameters
	    ----------
	    values : Array, ChunkedArray
	        replace each null element with fill_value
	    fill_value : Scalar-like object
	        Either a pyarrow.Scalar or any python object coercible to a
	        Scalar. If not same type as values will attempt to cast.

	    Returns
	    -------
	    result : depends on inputs

	    Examples
	    --------
	    >>> import pyarrow as pa
	    >>> arr = pa.array([1, 2, None, 3], type=pa.int8())
	    >>> fill_value = pa.scalar(5, type=pa.int8())
	    >>> arr.fill_null(fill_value)
	    <pyarrow.lib.Int8Array object at 0x7f95437f01a0>
	    [
	      1,
	      2,
	      5,
	      3
	    ]
	    """
	    if not isinstance(fill_value, pa.Scalar):
	        # Plain python object: wrap it using the type of the input.
	        fill_value = pa.scalar(fill_value, type=values.type)
	    elif values.type != fill_value.type:
	        # Scalar of another type: rebuild it from its python value.
	        fill_value = pa.scalar(fill_value.as_py(), type=values.type)

	    return call_function("fill_null", [values, fill_value])