// blob: 91d9313ba7d2e618b59585ae535cea20a5163091 [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include "udf/udf.h"
namespace impala {
using impala_udf::BigIntVal;
using impala_udf::DoubleVal;
using impala_udf::FloatVal;
using impala_udf::FunctionContext;
using impala_udf::StringVal;
/// Static functions that operate on serialized Apache DataSketches sketches (HLL and
/// KLL). Every function receives the sketch as a StringVal and the query fails when
/// that value is not a serialized sketch of the expected kind.
class DataSketchesFunctions {
 public:
  /// Returns the count(distinct) estimate stored in 'serialized_sketch'.
  /// 'serialized_sketch' has to be a serialized Apache DataSketches HLL sketch,
  /// otherwise the query fails.
  static BigIntVal DsHllEstimate(
      FunctionContext* ctx, const StringVal& serialized_sketch);

  /// Returns the stringified format of an Apache DataSketches HLL sketch.
  /// 'serialized_sketch' has to be a serialized Apache DataSketches HLL sketch,
  /// otherwise the query fails.
  static StringVal DsHllStringify(
      FunctionContext* ctx, const StringVal& serialized_sketch);

  /// Returns the item (estimate) of the sketched dataset that is selected by 'rank'.
  /// E.g. a 'rank' of 0.1 selects the item for which 10% of the sketched dataset is
  /// lower than or equal to that item. 'rank' has to be in the range of [0,1],
  /// otherwise this function returns an error.
  /// 'serialized_sketch' has to be a serialized Apache DataSketches KLL sketch,
  /// otherwise the query fails.
  static FloatVal DsKllQuantile(
      FunctionContext* ctx, const StringVal& serialized_sketch, const DoubleVal& rank);

  /// Returns the number of input values that were fed to 'serialized_sketch'.
  /// 'serialized_sketch' has to be a serialized Apache DataSketches KLL sketch,
  /// otherwise the query fails.
  static BigIntVal DsKllN(FunctionContext* ctx, const StringVal& serialized_sketch);

  /// Returns a value in the range of [0,1] describing where 'probe_value' falls within
  /// the sketched values; e.g. 0.2 means that 'probe_value' is greater than 20% of the
  /// values in 'serialized_sketch'. Note, the result is an approximation.
  /// 'serialized_sketch' has to be a serialized Apache DataSketches KLL sketch,
  /// otherwise the query fails.
  static DoubleVal DsKllRank(
      FunctionContext* ctx, const StringVal& serialized_sketch,
      const FloatVal& probe_value);

  /// Multi-rank counterpart of DsKllQuantile(): it can receive multiple ranks and
  /// returns a comma separated string holding the results for all the given ranks.
  /// 'serialized_sketch' has to be a serialized Apache DataSketches KLL sketch,
  /// otherwise the query fails.
  /// Note, the intended return type is an Array of floats, but that has to wait for
  /// the complex type support. Tracking Jira is IMPALA-9520.
  static StringVal DsKllQuantilesAsString(
      FunctionContext* ctx, const StringVal& serialized_sketch,
      int num_args, const DoubleVal* args);

  /// Returns a comma separated string that contains, for each range derived from
  /// 'args', the probability of having an item in that range. E.g. a value of 0.2
  /// means that approximately 20% of the items are in that given range.
  /// 'args' holds one or more numbers that are used as ranges to divide the input of
  /// the sketch. E.g. [1.0, 3.5, 10.1] creates the following ranges:
  ///   (-inf, 1.0), [1.0, 3.5), [3.5, 10.1), [10.1, +inf)
  /// 'serialized_sketch' has to be a serialized Apache DataSketches KLL sketch,
  /// otherwise the query fails.
  /// Note, the intended return type is an Array of doubles, but that has to wait for
  /// the complex type support. Tracking Jira is IMPALA-9520.
  static StringVal DsKllPMFAsString(
      FunctionContext* ctx, const StringVal& serialized_sketch,
      int num_args, const FloatVal* args);

  /// Returns a comma separated string that contains, for each range derived from
  /// 'args', the probability of having an item in that range. E.g. a value of 0.2
  /// means that approximately 20% of the items are in that given range.
  /// 'args' holds one or more numbers that are used as ranges to divide the input of
  /// the sketch. E.g. [1.0, 3.5, 10.1] creates the following ranges:
  ///   (-inf, 1.0), (-inf, 3.5), (-inf, 10.1), (-inf, +inf)
  /// 'serialized_sketch' has to be a serialized Apache DataSketches KLL sketch,
  /// otherwise the query fails.
  /// Note, the intended return type is an Array of doubles, but that has to wait for
  /// the complex type support. Tracking Jira is IMPALA-9520.
  static StringVal DsKllCDFAsString(
      FunctionContext* ctx, const StringVal& serialized_sketch,
      int num_args, const FloatVal* args);

  /// Returns the stringified format of an Apache DataSketches sketch.
  /// 'serialized_sketch' has to be a serialized Apache DataSketches KLL sketch,
  /// otherwise the query fails.
  static StringVal DsKllStringify(
      FunctionContext* ctx, const StringVal& serialized_sketch);

 private:
  /// Selects which distribution function GetDsKllPMFOrCDF() evaluates.
  enum PMFCDF { PMF, CDF };

  /// Shared helper behind DsKllPMFAsString() and DsKllCDFAsString(). 'mode' tells
  /// whether get_PMF() or get_CDF() should be invoked on the KLL sketch.
  static StringVal GetDsKllPMFOrCDF(
      FunctionContext* ctx, const StringVal& serialized_sketch,
      int num_args, const FloatVal* args, PMFCDF mode);
};
}