blob: d67568e15671a6cbd50080e7b3c5c7118159e648 [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <memory>
#include "arrow/compute/function.h"
#include "arrow/datum.h"
#include "arrow/result.h"
#include "arrow/type_fwd.h"
namespace arrow {
namespace compute {
class ExecContext;
/// \addtogroup compute-concrete-options
/// @{
struct FilterOptions : public FunctionOptions {
/// Configure the action taken when a slot of the selection mask is null
enum NullSelectionBehavior {
/// the corresponding filtered value will be removed in the output
DROP,
/// the corresponding filtered value will be null in the output
EMIT_NULL,
};
explicit FilterOptions(NullSelectionBehavior null_selection = DROP)
: null_selection_behavior(null_selection) {}
static FilterOptions Defaults() { return FilterOptions(); }
NullSelectionBehavior null_selection_behavior = DROP;
};
struct ARROW_EXPORT TakeOptions : public FunctionOptions {
explicit TakeOptions(bool boundscheck = true) : boundscheck(boundscheck) {}
bool boundscheck = true;
static TakeOptions BoundsCheck() { return TakeOptions(true); }
static TakeOptions NoBoundsCheck() { return TakeOptions(false); }
static TakeOptions Defaults() { return BoundsCheck(); }
};
enum class SortOrder {
Ascending,
Descending,
};
/// \brief Options for the dictionary encode function
struct DictionaryEncodeOptions : public FunctionOptions {
/// Configure how null values will be encoded
enum NullEncodingBehavior {
/// the null value will be added to the dictionary with a proper index
ENCODE,
/// the null value will be masked in the indices array
MASK
};
explicit DictionaryEncodeOptions(NullEncodingBehavior null_encoding = MASK)
: null_encoding_behavior(null_encoding) {}
static DictionaryEncodeOptions Defaults() { return DictionaryEncodeOptions(); }
NullEncodingBehavior null_encoding_behavior = MASK;
};
/// \brief One sort key for PartitionNthIndices (TODO) and SortIndices
struct ARROW_EXPORT SortKey {
explicit SortKey(std::string name, SortOrder order = SortOrder::Ascending)
: name(name), order(order) {}
/// The name of the sort column.
std::string name;
/// How to order by this sort key.
SortOrder order;
};
struct ARROW_EXPORT ArraySortOptions : public FunctionOptions {
explicit ArraySortOptions(SortOrder order = SortOrder::Ascending) : order(order) {}
static ArraySortOptions Defaults() { return ArraySortOptions{}; }
SortOrder order;
};
struct ARROW_EXPORT SortOptions : public FunctionOptions {
explicit SortOptions(std::vector<SortKey> sort_keys = {}) : sort_keys(sort_keys) {}
static SortOptions Defaults() { return SortOptions{}; }
std::vector<SortKey> sort_keys;
};
/// \brief Partitioning options for NthToIndices
struct ARROW_EXPORT PartitionNthOptions : public FunctionOptions {
explicit PartitionNthOptions(int64_t pivot) : pivot(pivot) {}
/// The index into the equivalent sorted array of the partition pivot element.
int64_t pivot;
};
/// @}
/// \brief Filter with a boolean selection filter
///
/// The output will be populated with values from the input at positions
/// where the selection filter is not 0. Nulls in the filter will be handled
/// based on options.null_selection_behavior.
///
/// For example given values = ["a", "b", "c", null, "e", "f"] and
/// filter = [0, 1, 1, 0, null, 1], the output will be
/// (null_selection_behavior == DROP) = ["b", "c", "f"]
/// (null_selection_behavior == EMIT_NULL) = ["b", "c", null, "f"]
///
/// \param[in] values array to filter
/// \param[in] filter indicates which values should be filtered out
/// \param[in] options configures null_selection_behavior
/// \param[in] ctx the function execution context, optional
/// \return the resulting datum
ARROW_EXPORT
Result<Datum> Filter(const Datum& values, const Datum& filter,
const FilterOptions& options = FilterOptions::Defaults(),
ExecContext* ctx = NULLPTR);
namespace internal {
// These internal functions are implemented in kernels/vector_selection.cc
/// \brief Return the number of selected indices in the boolean filter
ARROW_EXPORT
int64_t GetFilterOutputSize(const ArrayData& filter,
FilterOptions::NullSelectionBehavior null_selection);
/// \brief Compute uint64 selection indices for use with Take given a boolean
/// filter
ARROW_EXPORT
Result<std::shared_ptr<ArrayData>> GetTakeIndices(
const ArrayData& filter, FilterOptions::NullSelectionBehavior null_selection,
MemoryPool* memory_pool = default_memory_pool());
} // namespace internal
/// \brief Take from an array of values at indices in another array
///
/// The output array will be of the same type as the input values
/// array, with elements taken from the values array at the given
/// indices. If an index is null then the taken element will be null.
///
/// For example given values = ["a", "b", "c", null, "e", "f"] and
/// indices = [2, 1, null, 3], the output will be
/// = [values[2], values[1], null, values[3]]
/// = ["c", "b", null, null]
///
/// \param[in] values datum from which to take
/// \param[in] indices which values to take
/// \param[in] options options
/// \param[in] ctx the function execution context, optional
/// \return the resulting datum
ARROW_EXPORT
Result<Datum> Take(const Datum& values, const Datum& indices,
const TakeOptions& options = TakeOptions::Defaults(),
ExecContext* ctx = NULLPTR);
/// \brief Take with Array inputs and output
ARROW_EXPORT
Result<std::shared_ptr<Array>> Take(const Array& values, const Array& indices,
const TakeOptions& options = TakeOptions::Defaults(),
ExecContext* ctx = NULLPTR);
/// \brief Returns indices that partition an array around n-th
/// sorted element.
///
/// Find index of n-th(0 based) smallest value and perform indirect
/// partition of an array around that element. Output indices[0 ~ n-1]
/// holds values no greater than n-th element, and indices[n+1 ~ end]
/// holds values no less than n-th element. Elements in each partition
/// is not sorted. Nulls will be partitioned to the end of the output.
/// Output is not guaranteed to be stable.
///
/// \param[in] values array to be partitioned
/// \param[in] n pivot array around sorted n-th element
/// \param[in] ctx the function execution context, optional
/// \return offsets indices that would partition an array
ARROW_EXPORT
Result<std::shared_ptr<Array>> NthToIndices(const Array& values, int64_t n,
ExecContext* ctx = NULLPTR);
/// \brief Returns the indices that would sort an array in the
/// specified order.
///
/// Perform an indirect sort of array. The output array will contain
/// indices that would sort an array, which would be the same length
/// as input. Nulls will be stably partitioned to the end of the output
/// regardless of order.
///
/// For example given array = [null, 1, 3.3, null, 2, 5.3] and order
/// = SortOrder::DESCENDING, the output will be [5, 2, 4, 1, 0,
/// 3].
///
/// \param[in] array array to sort
/// \param[in] order ascending or descending
/// \param[in] ctx the function execution context, optional
/// \return offsets indices that would sort an array
ARROW_EXPORT
Result<std::shared_ptr<Array>> SortIndices(const Array& array,
SortOrder order = SortOrder::Ascending,
ExecContext* ctx = NULLPTR);
/// \brief Returns the indices that would sort a chunked array in the
/// specified order.
///
/// Perform an indirect sort of chunked array. The output array will
/// contain indices that would sort a chunked array, which would be
/// the same length as input. Nulls will be stably partitioned to the
/// end of the output regardless of order.
///
/// For example given chunked_array = [[null, 1], [3.3], [null, 2,
/// 5.3]] and order = SortOrder::DESCENDING, the output will be [5, 2,
/// 4, 1, 0, 3].
///
/// \param[in] chunked_array chunked array to sort
/// \param[in] order ascending or descending
/// \param[in] ctx the function execution context, optional
/// \return offsets indices that would sort an array
ARROW_EXPORT
Result<std::shared_ptr<Array>> SortIndices(const ChunkedArray& chunked_array,
SortOrder order = SortOrder::Ascending,
ExecContext* ctx = NULLPTR);
/// \brief Returns the indices that would sort an input in the
/// specified order. Input is one of array, chunked array record batch
/// or table.
///
/// Perform an indirect sort of input. The output array will contain
/// indices that would sort an input, which would be the same length
/// as input. Nulls will be stably partitioned to the end of the
/// output regardless of order.
///
/// For example given input (table) = {
/// "column1": [[null, 1], [ 3, null, 2, 1]],
/// "column2": [[ 5], [3, null, null, 5, 5]],
/// } and options = {
/// {"column1", SortOrder::Ascending},
/// {"column2", SortOrder::Descending},
/// }, the output will be [5, 1, 4, 2, 0, 3].
///
/// \param[in] datum array, chunked array, record batch or table to sort
/// \param[in] options options
/// \param[in] ctx the function execution context, optional
/// \return offsets indices that would sort a table
ARROW_EXPORT
Result<std::shared_ptr<Array>> SortIndices(const Datum& datum, const SortOptions& options,
ExecContext* ctx = NULLPTR);
/// \brief Compute unique elements from an array-like object
///
/// Note if a null occurs in the input it will NOT be included in the output.
///
/// \param[in] datum array-like input
/// \param[in] ctx the function execution context, optional
/// \return result as Array
///
/// \since 1.0.0
/// \note API not yet finalized
ARROW_EXPORT
Result<std::shared_ptr<Array>> Unique(const Datum& datum, ExecContext* ctx = NULLPTR);
// Constants for accessing the output of ValueCounts
ARROW_EXPORT extern const char kValuesFieldName[];
ARROW_EXPORT extern const char kCountsFieldName[];
ARROW_EXPORT extern const int32_t kValuesFieldIndex;
ARROW_EXPORT extern const int32_t kCountsFieldIndex;
/// \brief Return counts of unique elements from an array-like object.
///
/// Note that the counts do not include counts for nulls in the array. These can be
/// obtained separately from metadata.
///
/// For floating point arrays there is no attempt to normalize -0.0, 0.0 and NaN values
/// which can lead to unexpected results if the input Array has these values.
///
/// \param[in] value array-like input
/// \param[in] ctx the function execution context, optional
/// \return counts An array of <input type "Values", int64_t "Counts"> structs.
///
/// \since 1.0.0
/// \note API not yet finalized
ARROW_EXPORT
Result<std::shared_ptr<StructArray>> ValueCounts(const Datum& value,
ExecContext* ctx = NULLPTR);
/// \brief Dictionary-encode values in an array-like object
///
/// Any nulls encountered in the dictionary will be handled according to the
/// specified null encoding behavior.
///
/// For example, given values ["a", "b", null, "a", null] the output will be
/// (null_encoding == ENCODE) Indices: [0, 1, 2, 0, 2] / Dict: ["a", "b", null]
/// (null_encoding == MASK) Indices: [0, 1, null, 0, null] / Dict: ["a", "b"]
///
/// If the input is already dictionary encoded this function is a no-op unless
/// it needs to modify the null_encoding (TODO)
///
/// \param[in] data array-like input
/// \param[in] ctx the function execution context, optional
/// \param[in] options configures null encoding behavior
/// \return result with same shape and type as input
///
/// \since 1.0.0
/// \note API not yet finalized
ARROW_EXPORT
Result<Datum> DictionaryEncode(
const Datum& data,
const DictionaryEncodeOptions& options = DictionaryEncodeOptions::Defaults(),
ExecContext* ctx = NULLPTR);
// ----------------------------------------------------------------------
// Deprecated functions
ARROW_DEPRECATED("Deprecated in 1.0.0. Use Datum-based version")
ARROW_EXPORT
Result<std::shared_ptr<ChunkedArray>> Take(
const ChunkedArray& values, const Array& indices,
const TakeOptions& options = TakeOptions::Defaults(), ExecContext* context = NULLPTR);
ARROW_DEPRECATED("Deprecated in 1.0.0. Use Datum-based version")
ARROW_EXPORT
Result<std::shared_ptr<ChunkedArray>> Take(
const ChunkedArray& values, const ChunkedArray& indices,
const TakeOptions& options = TakeOptions::Defaults(), ExecContext* context = NULLPTR);
ARROW_DEPRECATED("Deprecated in 1.0.0. Use Datum-based version")
ARROW_EXPORT
Result<std::shared_ptr<ChunkedArray>> Take(
const Array& values, const ChunkedArray& indices,
const TakeOptions& options = TakeOptions::Defaults(), ExecContext* context = NULLPTR);
ARROW_DEPRECATED("Deprecated in 1.0.0. Use Datum-based version")
ARROW_EXPORT
Result<std::shared_ptr<RecordBatch>> Take(
const RecordBatch& batch, const Array& indices,
const TakeOptions& options = TakeOptions::Defaults(), ExecContext* context = NULLPTR);
ARROW_DEPRECATED("Deprecated in 1.0.0. Use Datum-based version")
ARROW_EXPORT
Result<std::shared_ptr<Table>> Take(const Table& table, const Array& indices,
const TakeOptions& options = TakeOptions::Defaults(),
ExecContext* context = NULLPTR);
ARROW_DEPRECATED("Deprecated in 1.0.0. Use Datum-based version")
ARROW_EXPORT
Result<std::shared_ptr<Table>> Take(const Table& table, const ChunkedArray& indices,
const TakeOptions& options = TakeOptions::Defaults(),
ExecContext* context = NULLPTR);
ARROW_DEPRECATED("Deprecated in 3.0.0. Use SortIndices()")
ARROW_EXPORT
Result<std::shared_ptr<Array>> SortToIndices(const Array& values,
ExecContext* ctx = NULLPTR);
} // namespace compute
} // namespace arrow