blob: f0de1be27934d1d3ed8d4502fbf6605c448d63c2 [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include <cmath>
#include <vector>
#include "arrow/compute/api_aggregate.h"
#include "arrow/compute/kernels/common.h"
#include "arrow/compute/kernels/util_internal.h"
#include "arrow/stl_allocator.h"
namespace arrow {
namespace compute {
namespace internal {
namespace {
using QuantileState = internal::OptionsWrapper<QuantileOptions>;
// output is at some input data point, not interpolated
bool IsDataPoint(const QuantileOptions& options) {
// some interpolation methods return exact data point
return options.interpolation == QuantileOptions::LOWER ||
options.interpolation == QuantileOptions::HIGHER ||
options.interpolation == QuantileOptions::NEAREST;
}
// quantile to exact datapoint index (IsDataPoint == true)
uint64_t QuantileToDataPoint(size_t length, double q,
enum QuantileOptions::Interpolation interpolation) {
const double index = (length - 1) * q;
uint64_t datapoint_index = static_cast<uint64_t>(index);
const double fraction = index - datapoint_index;
if (interpolation == QuantileOptions::LINEAR ||
interpolation == QuantileOptions::MIDPOINT) {
DCHECK_EQ(fraction, 0);
}
// convert NEAREST interpolation method to LOWER or HIGHER
if (interpolation == QuantileOptions::NEAREST) {
if (fraction < 0.5) {
interpolation = QuantileOptions::LOWER;
} else if (fraction > 0.5) {
interpolation = QuantileOptions::HIGHER;
} else {
// round 0.5 to nearest even number, similar to numpy.around
interpolation =
(datapoint_index & 1) ? QuantileOptions::HIGHER : QuantileOptions::LOWER;
}
}
if (interpolation == QuantileOptions::HIGHER && fraction != 0) {
++datapoint_index;
}
return datapoint_index;
}
// copy and nth_element approach, large memory footprint
template <typename InType>
struct SortQuantiler {
using CType = typename InType::c_type;
using Allocator = arrow::stl::allocator<CType>;
void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
const QuantileOptions& options = QuantileState::Get(ctx);
// copy all chunks to a buffer, ignore nulls and nans
std::vector<CType, Allocator> in_buffer(Allocator(ctx->memory_pool()));
const Datum& datum = batch[0];
const int64_t in_length = datum.length() - datum.null_count();
if (in_length > 0) {
in_buffer.resize(in_length);
CopyNonNullValues<sizeof(CType)>(datum, in_buffer.data());
// drop nan
if (is_floating_type<InType>::value) {
const auto& it = std::remove_if(in_buffer.begin(), in_buffer.end(),
[](CType v) { return v != v; });
in_buffer.resize(it - in_buffer.begin());
}
}
// prepare out array
int64_t out_length = options.q.size();
if (in_buffer.empty()) {
out_length = 0; // input is empty or only contains null and nan, return empty array
}
// out type depends on options
const bool is_datapoint = IsDataPoint(options);
const std::shared_ptr<DataType> out_type =
is_datapoint ? TypeTraits<InType>::type_singleton() : float64();
auto out_data = ArrayData::Make(out_type, out_length, 0);
out_data->buffers.resize(2, nullptr);
// calculate quantiles
if (out_length > 0) {
KERNEL_ASSIGN_OR_RAISE(out_data->buffers[1], ctx,
ctx->Allocate(out_length * GetBitWidth(*out_type) / 8));
// find quantiles in descending order
std::vector<int64_t> q_indices(out_length);
std::iota(q_indices.begin(), q_indices.end(), 0);
std::sort(q_indices.begin(), q_indices.end(),
[&options](int64_t left_index, int64_t right_index) {
return options.q[right_index] < options.q[left_index];
});
// input array is partitioned around data point at `last_index` (pivot)
// for next quatile which is smaller, we only consider inputs left of the pivot
uint64_t last_index = in_buffer.size();
if (is_datapoint) {
CType* out_buffer = out_data->template GetMutableValues<CType>(1);
for (int64_t i = 0; i < out_length; ++i) {
const int64_t q_index = q_indices[i];
out_buffer[q_index] = GetQuantileAtDataPoint(
in_buffer, &last_index, options.q[q_index], options.interpolation);
}
} else {
double* out_buffer = out_data->template GetMutableValues<double>(1);
for (int64_t i = 0; i < out_length; ++i) {
const int64_t q_index = q_indices[i];
out_buffer[q_index] = GetQuantileByInterp(
in_buffer, &last_index, options.q[q_index], options.interpolation);
}
}
}
*out = Datum(std::move(out_data));
}
// return quantile located exactly at some input data point
CType GetQuantileAtDataPoint(std::vector<CType, Allocator>& in, uint64_t* last_index,
double q,
enum QuantileOptions::Interpolation interpolation) {
const uint64_t datapoint_index = QuantileToDataPoint(in.size(), q, interpolation);
if (datapoint_index != *last_index) {
DCHECK_LT(datapoint_index, *last_index);
std::nth_element(in.begin(), in.begin() + datapoint_index,
in.begin() + *last_index);
*last_index = datapoint_index;
}
return in[datapoint_index];
}
// return quantile interpolated from adjacent input data points
double GetQuantileByInterp(std::vector<CType, Allocator>& in, uint64_t* last_index,
double q,
enum QuantileOptions::Interpolation interpolation) {
const double index = (in.size() - 1) * q;
const uint64_t lower_index = static_cast<uint64_t>(index);
const double fraction = index - lower_index;
if (lower_index != *last_index) {
DCHECK_LT(lower_index, *last_index);
std::nth_element(in.begin(), in.begin() + lower_index, in.begin() + *last_index);
}
const double lower_value = static_cast<double>(in[lower_index]);
if (fraction == 0) {
*last_index = lower_index;
return lower_value;
}
const uint64_t higher_index = lower_index + 1;
DCHECK_LT(higher_index, in.size());
if (lower_index != *last_index && higher_index != *last_index) {
DCHECK_LT(higher_index, *last_index);
// higher value must be the minimal value after lower_index
auto min = std::min_element(in.begin() + higher_index, in.begin() + *last_index);
std::iter_swap(in.begin() + higher_index, min);
}
*last_index = lower_index;
const double higher_value = static_cast<double>(in[higher_index]);
if (interpolation == QuantileOptions::LINEAR) {
// more stable than naive linear interpolation
return fraction * higher_value + (1 - fraction) * lower_value;
} else if (interpolation == QuantileOptions::MIDPOINT) {
return lower_value / 2 + higher_value / 2;
} else {
DCHECK(false);
return NAN;
}
}
};
// histogram approach with constant memory, only for integers within limited value range
template <typename InType>
struct CountQuantiler {
using CType = typename InType::c_type;
CType min;
std::vector<uint64_t> counts; // counts[i]: # of values equals i + min
// indices to adjacent non-empty bins covering current quantile
struct AdjacentBins {
int left_index;
int right_index;
uint64_t total_count; // accumulated counts till left_index (inclusive)
};
CountQuantiler(CType min, CType max) {
uint32_t value_range = static_cast<uint32_t>(max - min) + 1;
DCHECK_LT(value_range, 1 << 30);
this->min = min;
this->counts.resize(value_range, 0);
}
void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
const QuantileOptions& options = QuantileState::Get(ctx);
// count values in all chunks, ignore nulls
const Datum& datum = batch[0];
int64_t in_length = CountValues<CType>(this->counts.data(), datum, this->min);
// prepare out array
int64_t out_length = options.q.size();
if (in_length == 0) {
out_length = 0; // input is empty or only contains null, return empty array
}
// out type depends on options
const bool is_datapoint = IsDataPoint(options);
const std::shared_ptr<DataType> out_type =
is_datapoint ? TypeTraits<InType>::type_singleton() : float64();
auto out_data = ArrayData::Make(out_type, out_length, 0);
out_data->buffers.resize(2, nullptr);
// calculate quantiles
if (out_length > 0) {
KERNEL_ASSIGN_OR_RAISE(out_data->buffers[1], ctx,
ctx->Allocate(out_length * GetBitWidth(*out_type) / 8));
// find quantiles in ascending order
std::vector<int64_t> q_indices(out_length);
std::iota(q_indices.begin(), q_indices.end(), 0);
std::sort(q_indices.begin(), q_indices.end(),
[&options](int64_t left_index, int64_t right_index) {
return options.q[left_index] < options.q[right_index];
});
AdjacentBins bins{0, 0, this->counts[0]};
if (is_datapoint) {
CType* out_buffer = out_data->template GetMutableValues<CType>(1);
for (int64_t i = 0; i < out_length; ++i) {
const int64_t q_index = q_indices[i];
out_buffer[q_index] = GetQuantileAtDataPoint(
in_length, &bins, options.q[q_index], options.interpolation);
}
} else {
double* out_buffer = out_data->template GetMutableValues<double>(1);
for (int64_t i = 0; i < out_length; ++i) {
const int64_t q_index = q_indices[i];
out_buffer[q_index] = GetQuantileByInterp(in_length, &bins, options.q[q_index],
options.interpolation);
}
}
}
*out = Datum(std::move(out_data));
}
// return quantile located exactly at some input data point
CType GetQuantileAtDataPoint(int64_t in_length, AdjacentBins* bins, double q,
enum QuantileOptions::Interpolation interpolation) {
const uint64_t datapoint_index = QuantileToDataPoint(in_length, q, interpolation);
while (datapoint_index >= bins->total_count &&
static_cast<size_t>(bins->left_index) < this->counts.size() - 1) {
++bins->left_index;
bins->total_count += this->counts[bins->left_index];
}
DCHECK_LT(datapoint_index, bins->total_count);
return static_cast<CType>(bins->left_index + this->min);
}
// return quantile interpolated from adjacent input data points
double GetQuantileByInterp(int64_t in_length, AdjacentBins* bins, double q,
enum QuantileOptions::Interpolation interpolation) {
const double index = (in_length - 1) * q;
const uint64_t index_floor = static_cast<uint64_t>(index);
const double fraction = index - index_floor;
while (index_floor >= bins->total_count &&
static_cast<size_t>(bins->left_index) < this->counts.size() - 1) {
++bins->left_index;
bins->total_count += this->counts[bins->left_index];
}
DCHECK_LT(index_floor, bins->total_count);
const double lower_value = static_cast<double>(bins->left_index + this->min);
// quantile lies in this bin, no interpolation needed
if (index <= bins->total_count - 1) {
return lower_value;
}
// quantile lies across two bins, locate next bin if not already done
DCHECK_EQ(index_floor, bins->total_count - 1);
if (bins->right_index <= bins->left_index) {
bins->right_index = bins->left_index + 1;
while (static_cast<size_t>(bins->right_index) < this->counts.size() - 1 &&
this->counts[bins->right_index] == 0) {
++bins->right_index;
}
}
DCHECK_LT(static_cast<size_t>(bins->right_index), this->counts.size());
DCHECK_GT(this->counts[bins->right_index], 0);
const double higher_value = static_cast<double>(bins->right_index + this->min);
if (interpolation == QuantileOptions::LINEAR) {
return fraction * higher_value + (1 - fraction) * lower_value;
} else if (interpolation == QuantileOptions::MIDPOINT) {
return lower_value / 2 + higher_value / 2;
} else {
DCHECK(false);
return NAN;
}
}
};
// histogram or 'copy & nth_element' approach per value range and size, only for integers
template <typename InType>
struct CountOrSortQuantiler {
using CType = typename InType::c_type;
void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
// cross point to benefit from histogram approach
// parameters estimated from ad-hoc benchmarks manually
static constexpr int kMinArraySize = 65536;
static constexpr int kMaxValueRange = 65536;
const Datum& datum = batch[0];
if (datum.length() - datum.null_count() >= kMinArraySize) {
CType min, max;
std::tie(min, max) = GetMinMax<CType>(datum);
if (static_cast<uint64_t>(max) - static_cast<uint64_t>(min) <= kMaxValueRange) {
CountQuantiler<InType>(min, max).Exec(ctx, batch, out);
return;
}
}
SortQuantiler<InType>().Exec(ctx, batch, out);
}
};
template <typename InType, typename Enable = void>
struct ExactQuantiler;
template <>
struct ExactQuantiler<UInt8Type> {
CountQuantiler<UInt8Type> impl;
ExactQuantiler() : impl(0, 255) {}
};
template <>
struct ExactQuantiler<Int8Type> {
CountQuantiler<Int8Type> impl;
ExactQuantiler() : impl(-128, 127) {}
};
template <typename InType>
struct ExactQuantiler<InType, enable_if_t<(is_integer_type<InType>::value &&
(sizeof(typename InType::c_type) > 1))>> {
CountOrSortQuantiler<InType> impl;
};
template <typename InType>
struct ExactQuantiler<InType, enable_if_t<is_floating_type<InType>::value>> {
SortQuantiler<InType> impl;
};
template <typename _, typename InType>
struct QuantileExecutor {
static void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
if (ctx->state() == nullptr) {
ctx->SetStatus(Status::Invalid("Quantile requires QuantileOptions"));
return;
}
const QuantileOptions& options = QuantileState::Get(ctx);
if (options.q.empty()) {
ctx->SetStatus(Status::Invalid("Requires quantile argument"));
return;
}
for (double q : options.q) {
if (q < 0 || q > 1) {
ctx->SetStatus(Status::Invalid("Quantile must be between 0 and 1"));
return;
}
}
ExactQuantiler<InType>().impl.Exec(ctx, batch, out);
}
};
Result<ValueDescr> ResolveOutput(KernelContext* ctx,
const std::vector<ValueDescr>& args) {
const QuantileOptions& options = QuantileState::Get(ctx);
if (IsDataPoint(options)) {
return ValueDescr::Array(args[0].type);
} else {
return ValueDescr::Array(float64());
}
}
void AddQuantileKernels(VectorFunction* func) {
VectorKernel base;
base.init = QuantileState::Init;
base.can_execute_chunkwise = false;
base.output_chunked = false;
for (const auto& ty : NumericTypes()) {
base.signature =
KernelSignature::Make({InputType::Array(ty)}, OutputType(ResolveOutput));
// output type is determined at runtime, set template argument to nulltype
base.exec = GenerateNumeric<QuantileExecutor, NullType>(*ty);
DCHECK_OK(func->AddKernel(base));
}
}
const FunctionDoc quantile_doc{
"Compute an array of quantiles of a numeric array or chunked array",
("By default, 0.5 quantile (median) is returned.\n"
"If quantile lies between two data points, an interpolated value is\n"
"returned based on selected interpolation method.\n"
"Nulls and NaNs are ignored.\n"
"An empty array is returned if there is no valid data point."),
{"array"},
"QuantileOptions"};
} // namespace
void RegisterScalarAggregateQuantile(FunctionRegistry* registry) {
static QuantileOptions default_options;
auto func = std::make_shared<VectorFunction>("quantile", Arity::Unary(), &quantile_doc,
&default_options);
AddQuantileKernels(func.get());
DCHECK_OK(registry->AddFunction(std::move(func)));
}
} // namespace internal
} // namespace compute
} // namespace arrow