blob: ca118ec56780aaa1d8ffc5b15b7ebdc4fed23505 [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
// Eager evaluation convenience APIs for invoking common functions, including
// necessary memory allocations
#pragma once
#include "arrow/compute/function.h"
#include "arrow/datum.h"
#include "arrow/result.h"
#include "arrow/util/macros.h"
#include "arrow/util/visibility.h"
namespace arrow {
class Array;
namespace compute {
class ExecContext;
// ----------------------------------------------------------------------
// Aggregate functions
/// \addtogroup compute-concrete-options
/// @{
/// \brief Control Count kernel behavior
///
/// By default, all non-null values are counted.
struct ARROW_EXPORT CountOptions : public FunctionOptions {
enum Mode {
/// Count all non-null values.
COUNT_NON_NULL = 0,
/// Count all null values.
COUNT_NULL,
};
explicit CountOptions(enum Mode count_mode = COUNT_NON_NULL) : count_mode(count_mode) {}
static CountOptions Defaults() { return CountOptions(COUNT_NON_NULL); }
enum Mode count_mode;
};
/// \brief Control MinMax kernel behavior
///
/// By default, null values are ignored
struct ARROW_EXPORT MinMaxOptions : public FunctionOptions {
enum Mode {
/// Skip null values
SKIP = 0,
/// Any nulls will result in null output
EMIT_NULL
};
explicit MinMaxOptions(enum Mode null_handling = SKIP) : null_handling(null_handling) {}
static MinMaxOptions Defaults() { return MinMaxOptions{}; }
enum Mode null_handling;
};
/// \brief Control Mode kernel behavior
///
/// Returns top-n common values and counts.
/// By default, returns the most common value and count.
struct ARROW_EXPORT ModeOptions : public FunctionOptions {
explicit ModeOptions(int64_t n = 1) : n(n) {}
static ModeOptions Defaults() { return ModeOptions{}; }
int64_t n = 1;
};
/// \brief Control Delta Degrees of Freedom (ddof) of Variance and Stddev kernel
///
/// The divisor used in calculations is N - ddof, where N is the number of elements.
/// By default, ddof is zero, and population variance or stddev is returned.
struct ARROW_EXPORT VarianceOptions : public FunctionOptions {
explicit VarianceOptions(int ddof = 0) : ddof(ddof) {}
static VarianceOptions Defaults() { return VarianceOptions{}; }
int ddof = 0;
};
/// \brief Control Quantile kernel behavior
///
/// By default, returns the median value.
struct ARROW_EXPORT QuantileOptions : public FunctionOptions {
/// Interpolation method to use when quantile lies between two data points
enum Interpolation {
LINEAR = 0,
LOWER,
HIGHER,
NEAREST,
MIDPOINT,
};
explicit QuantileOptions(double q = 0.5, enum Interpolation interpolation = LINEAR)
: q{q}, interpolation{interpolation} {}
explicit QuantileOptions(std::vector<double> q,
enum Interpolation interpolation = LINEAR)
: q{std::move(q)}, interpolation{interpolation} {}
static QuantileOptions Defaults() { return QuantileOptions{}; }
/// quantile must be between 0 and 1 inclusive
std::vector<double> q;
enum Interpolation interpolation;
};
/// \brief Control TDigest approximate quantile kernel behavior
///
/// By default, returns the median value.
struct ARROW_EXPORT TDigestOptions : public FunctionOptions {
explicit TDigestOptions(double q = 0.5, uint32_t delta = 100,
uint32_t buffer_size = 500)
: q{q}, delta{delta}, buffer_size{buffer_size} {}
explicit TDigestOptions(std::vector<double> q, uint32_t delta = 100,
uint32_t buffer_size = 500)
: q{std::move(q)}, delta{delta}, buffer_size{buffer_size} {}
static TDigestOptions Defaults() { return TDigestOptions{}; }
/// quantile must be between 0 and 1 inclusive
std::vector<double> q;
/// compression parameter, default 100
uint32_t delta;
/// input buffer size, default 500
uint32_t buffer_size;
};
/// @}
/// \brief Count non-null (or null) values in an array.
///
/// \param[in] options counting options, see CountOptions for more information
/// \param[in] datum to count
/// \param[in] ctx the function execution context, optional
/// \return out resulting datum
///
/// \since 1.0.0
/// \note API not yet finalized
ARROW_EXPORT
Result<Datum> Count(const Datum& datum, CountOptions options = CountOptions::Defaults(),
ExecContext* ctx = NULLPTR);
/// \brief Compute the mean of a numeric array.
///
/// \param[in] value datum to compute the mean, expecting Array
/// \param[in] ctx the function execution context, optional
/// \return datum of the computed mean as a DoubleScalar
///
/// \since 1.0.0
/// \note API not yet finalized
ARROW_EXPORT
Result<Datum> Mean(const Datum& value, ExecContext* ctx = NULLPTR);
/// \brief Sum values of a numeric array.
///
/// \param[in] value datum to sum, expecting Array or ChunkedArray
/// \param[in] ctx the function execution context, optional
/// \return datum of the computed sum as a Scalar
///
/// \since 1.0.0
/// \note API not yet finalized
ARROW_EXPORT
Result<Datum> Sum(const Datum& value, ExecContext* ctx = NULLPTR);
/// \brief Calculate the min / max of a numeric array
///
/// This function returns both the min and max as a struct scalar, with type
/// struct<min: T, max: T>, where T is the input type
///
/// \param[in] value input datum, expecting Array or ChunkedArray
/// \param[in] options see MinMaxOptions for more information
/// \param[in] ctx the function execution context, optional
/// \return resulting datum as a struct<min: T, max: T> scalar
///
/// \since 1.0.0
/// \note API not yet finalized
ARROW_EXPORT
Result<Datum> MinMax(const Datum& value,
const MinMaxOptions& options = MinMaxOptions::Defaults(),
ExecContext* ctx = NULLPTR);
/// \brief Test whether any element in a boolean array evaluates to true.
///
/// This function returns true if any of the elements in the array evaluates
/// to true and false otherwise. Null values are skipped.
///
/// \param[in] value input datum, expecting a boolean array
/// \param[in] ctx the function execution context, optional
/// \return resulting datum as a BooleanScalar
///
/// \since 3.0.0
/// \note API not yet finalized
ARROW_EXPORT
Result<Datum> Any(const Datum& value, ExecContext* ctx = NULLPTR);
/// \brief Test whether all elements in a boolean array evaluate to true.
///
/// This function returns true if all of the elements in the array evaluate
/// to true and false otherwise. Null values are skipped.
///
/// \param[in] value input datum, expecting a boolean array
/// \param[in] ctx the function execution context, optional
/// \return resulting datum as a BooleanScalar
/// \since 3.0.0
/// \note API not yet finalized
ARROW_EXPORT
Result<Datum> All(const Datum& value, ExecContext* ctx = NULLPTR);
/// \brief Calculate the modal (most common) value of a numeric array
///
/// This function returns top-n most common values and number of times they occur as
/// an array of `struct<mode: T, count: int64>`, where T is the input type.
/// Values with larger counts are returned before smaller ones.
/// If there are more than one values with same count, smaller value is returned first.
///
/// \param[in] value input datum, expecting Array or ChunkedArray
/// \param[in] options see ModeOptions for more information
/// \param[in] ctx the function execution context, optional
/// \return resulting datum as an array of struct<mode: T, count: int64>
///
/// \since 2.0.0
/// \note API not yet finalized
ARROW_EXPORT
Result<Datum> Mode(const Datum& value,
const ModeOptions& options = ModeOptions::Defaults(),
ExecContext* ctx = NULLPTR);
/// \brief Calculate the standard deviation of a numeric array
///
/// \param[in] value input datum, expecting Array or ChunkedArray
/// \param[in] options see VarianceOptions for more information
/// \param[in] ctx the function execution context, optional
/// \return datum of the computed standard deviation as a DoubleScalar
///
/// \since 2.0.0
/// \note API not yet finalized
ARROW_EXPORT
Result<Datum> Stddev(const Datum& value,
const VarianceOptions& options = VarianceOptions::Defaults(),
ExecContext* ctx = NULLPTR);
/// \brief Calculate the variance of a numeric array
///
/// \param[in] value input datum, expecting Array or ChunkedArray
/// \param[in] options see VarianceOptions for more information
/// \param[in] ctx the function execution context, optional
/// \return datum of the computed variance as a DoubleScalar
///
/// \since 2.0.0
/// \note API not yet finalized
ARROW_EXPORT
Result<Datum> Variance(const Datum& value,
const VarianceOptions& options = VarianceOptions::Defaults(),
ExecContext* ctx = NULLPTR);
/// \brief Calculate the quantiles of a numeric array
///
/// \param[in] value input datum, expecting Array or ChunkedArray
/// \param[in] options see QuantileOptions for more information
/// \param[in] ctx the function execution context, optional
/// \return resulting datum as an array
///
/// \since 4.0.0
/// \note API not yet finalized
ARROW_EXPORT
Result<Datum> Quantile(const Datum& value,
const QuantileOptions& options = QuantileOptions::Defaults(),
ExecContext* ctx = NULLPTR);
/// \brief Calculate the approximate quantiles of a numeric array with T-Digest algorithm
///
/// \param[in] value input datum, expecting Array or ChunkedArray
/// \param[in] options see TDigestOptions for more information
/// \param[in] ctx the function execution context, optional
/// \return resulting datum as an array
///
/// \since 4.0.0
/// \note API not yet finalized
ARROW_EXPORT
Result<Datum> TDigest(const Datum& value,
const TDigestOptions& options = TDigestOptions::Defaults(),
ExecContext* ctx = NULLPTR);
namespace internal {
/// Internal use only: streaming group identifier.
/// Consumes batches of keys and yields batches of the group ids.
class ARROW_EXPORT Grouper {
public:
virtual ~Grouper() = default;
/// Construct a Grouper which receives the specified key types
static Result<std::unique_ptr<Grouper>> Make(const std::vector<ValueDescr>& descrs,
ExecContext* ctx = default_exec_context());
/// Consume a batch of keys, producing the corresponding group ids as an integer array.
/// Currently only uint32 indices will be produced, eventually the bit width will only
/// be as wide as necessary.
virtual Result<Datum> Consume(const ExecBatch& batch) = 0;
/// Get current unique keys. May be called multiple times.
virtual Result<ExecBatch> GetUniques() = 0;
/// Get the current number of groups.
virtual uint32_t num_groups() const = 0;
/// \brief Assemble lists of indices of identical elements.
///
/// \param[in] ids An unsigned, all-valid integral array which will be
/// used as grouping criteria.
/// \param[in] num_groups An upper bound for the elements of ids
/// \return A num_groups-long ListArray where the slot at i contains a
/// list of indices where i appears in ids.
///
/// MakeGroupings([
/// 2,
/// 2,
/// 5,
/// 5,
/// 2,
/// 3
/// ], 8) == [
/// [],
/// [],
/// [0, 1, 4],
/// [5],
/// [],
/// [2, 3],
/// [],
/// []
/// ]
static Result<std::shared_ptr<ListArray>> MakeGroupings(
const UInt32Array& ids, uint32_t num_groups,
ExecContext* ctx = default_exec_context());
/// \brief Produce a ListArray whose slots are selections of `array` which correspond to
/// the provided groupings.
///
/// For example,
/// ApplyGroupings([
/// [],
/// [],
/// [0, 1, 4],
/// [5],
/// [],
/// [2, 3],
/// [],
/// []
/// ], [2, 2, 5, 5, 2, 3]) == [
/// [],
/// [],
/// [2, 2, 2],
/// [3],
/// [],
/// [5, 5],
/// [],
/// []
/// ]
static Result<std::shared_ptr<ListArray>> ApplyGroupings(
const ListArray& groupings, const Array& array,
ExecContext* ctx = default_exec_context());
};
/// \brief Configure a grouped aggregation
struct ARROW_EXPORT Aggregate {
/// the name of the aggregation function
std::string function;
/// options for the aggregation function
const FunctionOptions* options;
};
/// Internal use only: helper function for testing HashAggregateKernels.
/// This will be replaced by streaming execution operators.
ARROW_EXPORT
Result<Datum> GroupBy(const std::vector<Datum>& arguments, const std::vector<Datum>& keys,
const std::vector<Aggregate>& aggregates,
ExecContext* ctx = default_exec_context());
} // namespace internal
} // namespace compute
} // namespace arrow