blob: 3f44793e2f9b468389dc142e3dd253d1878880a3 [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include "udf/udf.h"
namespace impala {
using impala_udf::IntVal;
using impala_udf::BigIntVal;
using impala_udf::DoubleVal;
using impala_udf::FloatVal;
using impala_udf::FunctionContext;
using impala_udf::StringVal;
/// Collection of static functions that operate on serialized Apache DataSketches
/// sketches (HLL, CPC, Theta and KLL). Unless noted otherwise, every function expects
/// its sketch argument(s) to hold a serialized sketch of the matching type and the
/// query fails if that is not the case.
class DataSketchesFunctions {
 public:
  // ---------------------------------------------------------------------------------
  // HLL sketches
  // ---------------------------------------------------------------------------------

  /// Returns the count(distinct) estimate stored in 'serialized_sketch', which has to
  /// be a serialized Apache DataSketches HLL sketch. The query fails otherwise.
  static BigIntVal DsHllEstimate(
      FunctionContext* ctx, const StringVal& serialized_sketch);

  /// Returns the distinct count estimate together with its bounds from
  /// 'serialized_sketch', which has to be a serialized Apache DataSketches HLL sketch.
  /// The query fails otherwise.
  /// Three values are produced: estimate, lower bound and upper bound.
  /// Note, the intended return type is an Array of doubles, but that has to wait for
  /// complex type support. Tracking Jira is IMPALA-9520.
  static StringVal DsHllEstimateBoundsAsString(
      FunctionContext* ctx, const StringVal& serialized_sketch);

  /// Overload of DsHllEstimateBoundsAsString() that utilizes the kappa value
  /// specified through the 2nd argument of ds_hll_estimate_bounds_as_string().
  /// 'kappa' is a number of standard deviations from the mean: 1, 2 or 3 (default 2).
  static StringVal DsHllEstimateBoundsAsString(
      FunctionContext* ctx, const StringVal& serialized_sketch, const IntVal& kappa);

  /// Unions two sketches and returns the resulting sketch. Both
  /// 'first_serialized_sketch' and 'second_serialized_sketch' have to be serialized
  /// Apache DataSketches HLL sketches. The query fails otherwise.
  static StringVal DsHllUnionF(
      FunctionContext* ctx,
      const StringVal& first_serialized_sketch,
      const StringVal& second_serialized_sketch);

  /// Returns the stringified format of 'serialized_sketch', which has to be a
  /// serialized Apache DataSketches HLL sketch. The query fails otherwise.
  static StringVal DsHllStringify(
      FunctionContext* ctx, const StringVal& serialized_sketch);

  // ---------------------------------------------------------------------------------
  // CPC sketches
  // ---------------------------------------------------------------------------------

  /// Returns the count(distinct) estimate stored in 'serialized_sketch', which has to
  /// be a serialized Apache DataSketches CPC sketch. The query fails otherwise.
  static BigIntVal DsCpcEstimate(FunctionContext* ctx,
      const StringVal& serialized_sketch);

  /// Returns the stringified format of 'serialized_sketch', which has to be a
  /// serialized Apache DataSketches CPC sketch. The query fails otherwise.
  static StringVal DsCpcStringify(FunctionContext* ctx,
      const StringVal& serialized_sketch);

  /// Unions two sketches and returns the resulting sketch. Both
  /// 'first_serialized_sketch' and 'second_serialized_sketch' have to be serialized
  /// Apache DataSketches CPC sketches. The query fails otherwise.
  static StringVal DsCpcUnionF(
      FunctionContext* ctx,
      const StringVal& first_serialized_sketch,
      const StringVal& second_serialized_sketch);

  // ---------------------------------------------------------------------------------
  // Theta sketches
  // ---------------------------------------------------------------------------------

  /// Returns the count(distinct) estimate stored in 'serialized_sketch', which has to
  /// be a serialized Apache DataSketches Theta sketch. The query fails otherwise.
  static BigIntVal DsThetaEstimate(FunctionContext* ctx,
      const StringVal& serialized_sketch);

  /// Computes the a-not-b set operation given two sketches of the same or of
  /// different columns. Both 'first_serialized_sketch' and 'second_serialized_sketch'
  /// have to be serialized Apache DataSketches Theta sketches. The query fails
  /// otherwise.
  static StringVal DsThetaExclude(
      FunctionContext* ctx,
      const StringVal& first_serialized_sketch,
      const StringVal& second_serialized_sketch);

  /// Unions two sketches and returns the resulting sketch. Both
  /// 'first_serialized_sketch' and 'second_serialized_sketch' have to be serialized
  /// Apache DataSketches Theta sketches. The query fails otherwise.
  static StringVal DsThetaUnionF(
      FunctionContext* ctx,
      const StringVal& first_serialized_sketch,
      const StringVal& second_serialized_sketch);

  /// Intersects two sketches and returns the sketch resulting from the intersection.
  /// Both 'first_serialized_sketch' and 'second_serialized_sketch' have to be
  /// serialized Apache DataSketches Theta sketches. The query fails otherwise.
  static StringVal DsThetaIntersectF(
      FunctionContext* ctx,
      const StringVal& first_serialized_sketch,
      const StringVal& second_serialized_sketch);

  // ---------------------------------------------------------------------------------
  // KLL sketches
  // ---------------------------------------------------------------------------------

  /// Returns the item (estimate) of the sketched dataset that belongs to 'rank'.
  /// E.g. a 'rank' of 0.1 identifies the item where 10% of the sketched dataset is
  /// lower than or equal to that item. 'rank' should be in the range of [0,1],
  /// otherwise this function returns an error.
  /// 'serialized_sketch' has to be a serialized Apache DataSketches KLL sketch. The
  /// query fails otherwise.
  static FloatVal DsKllQuantile(
      FunctionContext* ctx, const StringVal& serialized_sketch, const DoubleVal& rank);

  /// Returns the number of input values fed to 'serialized_sketch', which has to be a
  /// serialized Apache DataSketches KLL sketch. The query fails otherwise.
  static BigIntVal DsKllN(
      FunctionContext* ctx, const StringVal& serialized_sketch);

  /// Returns a value in the range of [0,1] where e.g. 0.2 means that 'probe_value' is
  /// greater than 20% of the values in 'serialized_sketch'. Note, this is an
  /// approximate calculation.
  /// 'serialized_sketch' has to be a serialized Apache DataSketches KLL sketch. The
  /// query fails otherwise.
  static DoubleVal DsKllRank(
      FunctionContext* ctx,
      const StringVal& serialized_sketch,
      const FloatVal& probe_value);

  /// Similar to DsKllQuantile(), but receives multiple ranks ('num_args' entries in
  /// 'args') and returns a comma separated string that contains the results for all
  /// the given ranks.
  /// 'serialized_sketch' has to be a serialized Apache DataSketches KLL sketch. The
  /// query fails otherwise.
  /// Note, the intended return type is an Array of floats, but that has to wait for
  /// complex type support. Tracking Jira is IMPALA-9520.
  static StringVal DsKllQuantilesAsString(
      FunctionContext* ctx,
      const StringVal& serialized_sketch,
      int num_args,
      const DoubleVal* args);

  /// Returns a comma separated string that contains the probability of having an item
  /// in each of the ranges derived from 'args'. E.g. a value of 0.2 means that
  /// approximately 20% of the items are in that given range.
  /// 'args' holds one or more numbers that are used as ranges to divide the input of
  /// the sketch. E.g. [1.0, 3.5, 10.1] creates the following ranges:
  /// (-inf, 1.0), [1.0, 3.5), [3.5, 10.1), [10.1, +inf)
  /// 'serialized_sketch' has to be a serialized Apache DataSketches KLL sketch. The
  /// query fails otherwise.
  /// Note, the intended return type is an Array of doubles, but that has to wait for
  /// complex type support. Tracking Jira is IMPALA-9520.
  static StringVal DsKllPMFAsString(
      FunctionContext* ctx,
      const StringVal& serialized_sketch,
      int num_args,
      const FloatVal* args);

  /// Returns a comma separated string that contains the probability of having an item
  /// in each of the ranges derived from 'args'. E.g. a value of 0.2 means that
  /// approximately 20% of the items are in that given range.
  /// 'args' holds one or more numbers that are used as ranges to divide the input of
  /// the sketch. E.g. [1.0, 3.5, 10.1] creates the following ranges:
  /// (-inf, 1.0), (-inf, 3.5), (-inf, 10.1), (-inf, +inf)
  /// 'serialized_sketch' has to be a serialized Apache DataSketches KLL sketch. The
  /// query fails otherwise.
  /// Note, the intended return type is an Array of doubles, but that has to wait for
  /// complex type support. Tracking Jira is IMPALA-9520.
  static StringVal DsKllCDFAsString(
      FunctionContext* ctx,
      const StringVal& serialized_sketch,
      int num_args,
      const FloatVal* args);

  /// Returns the stringified format of 'serialized_sketch', which has to be a
  /// serialized Apache DataSketches KLL sketch. The query fails otherwise.
  static StringVal DsKllStringify(
      FunctionContext* ctx, const StringVal& serialized_sketch);

 private:
  /// Selects which distribution function GetDsKllPMFOrCDF() evaluates.
  enum PMFCDF { PMF, CDF };

  /// Shared helper behind DsKllPMFAsString() and DsKllCDFAsString(). 'mode' indicates
  /// whether get_PMF() or get_CDF() should be invoked on the KLL sketch.
  static StringVal GetDsKllPMFOrCDF(
      FunctionContext* ctx,
      const StringVal& serialized_sketch,
      int num_args,
      const FloatVal* args,
      PMFCDF mode);
};
}