blob: 8b3ae137934ae55220c29429435650fe9e0b874d [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#ifndef IMPALA_UTIL_MIN_MAX_FILTER_H
#define IMPALA_UTIL_MIN_MAX_FILTER_H
#include "gen-cpp/ImpalaInternalService_types.h"
#include "impala-ir/impala-ir-functions.h"
#include "runtime/date-value.h"
#include "runtime/decimal-value.h"
#include "runtime/string-buffer.h"
#include "runtime/string-value.h"
#include "runtime/timestamp-value.h"
#include "runtime/types.h"
namespace impala {
class MemTacker;
class ObjectPool;
/// A MinMaxFilter tracks the min and max currently seen values in a data set for use in
/// runtime filters.
///
/// Filters are constructed using MinMaxFilter::Create() which returns a MinMaxFilter of
/// the appropriate type. Values can then be added using Insert(), InsertForLE(),
/// InsertForLT(), InsertForGE() or InsertForGT, and the min and max can be retrieved
/// using GetMin()/GetMax().
///
/// MinMaxFilters ignore NULL values, and so are only appropriate to use as a runtime
/// filter if the join predicate is '=' and not 'is not distinct from'.
class MinMaxFilter {
public:
MinMaxFilter() : always_true_(false) {}
virtual ~MinMaxFilter() {}
virtual void Close() {}
/// Returns the min/max values in the tuple slot representation. It is not valid to call
/// these functions if AlwaysFalse() returns true.
virtual const void* GetMin() const = 0;
virtual const void* GetMax() const = 0;
/// Returns the min/max values in the out paramsters 'out_min'/'out_max', converted to
/// fit into 'type', eg. if the calculated max value is greater than the max value for
/// 'type', the returned max is the max for 'type'. Returns false if the entire range
/// from the calculated min to max is outside the range for 'type'. May only be called
/// for integer-typed filters.
virtual bool GetCastIntMinMax(
const ColumnType& type, int64_t* out_min, int64_t* out_max) const;
/// Determine whether two ranges: [data_min, data_max] and [filter_min, filter_max]
/// overlap. Return true if the two overlaps and false otherwise. Both data_min and
/// data_max are of type 'type'. Since the overlap result can be used to set the
/// the always_true_ flag, the actual overlap computation ignores the flag.
virtual bool EvalOverlap(
const ColumnType& type, void* data_min, void* data_max) const = 0;
virtual PrimitiveType type() const = 0;
/// Add a new value, updating the current min/max.
virtual void Insert(const void* val) = 0;
/// Add a new value, updating the current min/max when always false is true, or
/// only the current max otherwise.
virtual void InsertForLE(const void* val) = 0;
/// Add a new value which is 'val'-1, updating the current min/max when always false is
/// true, or only the current max otherwise.
virtual void InsertForLT(const void* val) = 0;
/// Add a new value, updating the current min/max when always false is true, or
/// only the current min otherwise.
virtual void InsertForGE(const void* val) = 0;
/// Add a new value which is 'val'-1, updating the current min/max when always false is
/// true, or only the current min otherwise.
virtual void InsertForGT(const void* val) = 0;
/// If true, this filter doesn't allow any rows to pass.
virtual bool AlwaysFalse() const = 0;
/// Materialize filter values by copying any values stored by filters into memory owned
/// by the filter. Filters may assume that the memory for Insert()-ed values stays valid
/// until this is called.
virtual void MaterializeValues() {}
/// Convert this filter to a protobuf representation.
virtual void ToProtobuf(MinMaxFilterPB* protobuf) const = 0;
virtual std::string DebugString() const = 0;
/// Returns a new MinMaxFilter with the given type, allocated from 'mem_tracker'.
static MinMaxFilter* Create(ColumnType type, ObjectPool* pool, MemTracker* mem_tracker);
/// Returns a new MinMaxFilter created from the protobuf representation, allocated from
/// 'mem_tracker'.
static MinMaxFilter* Create(const MinMaxFilterPB& protobuf, ColumnType type,
ObjectPool* pool, MemTracker* mem_tracker);
/// Updates this filter with the logical OR of this filter and 'other'.
void Or(const MinMaxFilter& other);
/// Computes the logical OR of 'in' with 'out' and stores the result in 'out'.
static void Or(
const MinMaxFilterPB& in, MinMaxFilterPB* out, const ColumnType& columnType);
/// Copies the contents of 'in' into 'out'.
static void Copy(const MinMaxFilterPB& in, MinMaxFilterPB* out);
/// Returns the LLVM_CLASS_NAME for the given type.
static std::string GetLlvmClassName(PrimitiveType type);
/// Returns the LLVM_CLASS_NAME for this base class 'MinMaxFilter'.
static const char* LLVM_CLASS_NAME;
/// Returns the IRFunction::Type for Insert() for the given type.
static IRFunction::Type GetInsertIRFunctionType(ColumnType col_type);
/// Returns the IRFunction::Type for AlwaysTrue() for the given type.
static IRFunction::Type GetAlwaysTrueIRFunctionType(ColumnType col_type);
virtual bool AlwaysTrue() const = 0;
/// Compute and return the ratio of the filter overlapping with a range defined by
/// [data_min, data_max] of type 'type' as follows.
/// 1. If there is no overlapping, ratio = 0.0;
/// 2. If the filter completely covers the data range, ratio = 1.0;
/// 3. If the filter covers the data range to a certain degree,
/// ratio = (overlapped area) / (data area), where
/// overlapped area = min(filter_max, data_max) - max(filter_min, data_min) + 1
/// data area = data_max - data_min + 1
virtual float ComputeOverlapRatio(
const ColumnType& type, void* data_min, void* data_max) = 0;
/// Compute and return the ratio of the filter overlapping with a range defined by
/// [data_min, data_max] of type 'type' where the min and max are of TColumnValues.
/// This version of the method calls ComputeOverlapRatio(const ColumnType&, void*,
/// void*) with the proper fields in TColumnValues cast to void*.
///
/// This method is used mainly by hash join builder to evaluate the usefulness of
/// a min/max filter against the min/max column stats, and implemented only for a
/// subset of primitive column types (INTEGER, FLAOT, DOUBLE, DATE and DECIMAL) for
/// which the column stats can be stored in HMS.
///
/// The default implementation returns an overlap ratio of 0.0.
virtual float ComputeOverlapRatio(const ColumnType& type, const TColumnValue& data_min,
const TColumnValue& data_max) { return 0.0; }
/// Makes this filter always return true.
virtual void SetAlwaysTrue() { always_true_ = true; }
/// Return a debug string for 'filter'
static std::string DebugString(
const MinMaxFilterPB& filter, const ColumnType& col_type);
/// Test whether 'filter' is always true field is set and the field value is 'true'
static bool AlwaysTrue(const MinMaxFilterPB& filter);
/// Test whether 'filter' is always false field is set and the field value is 'false'
static bool AlwaysFalse(const MinMaxFilterPB& filter);
/// Return a debug string for 'value'
static std::string DebugString(const ColumnValuePB& value, const ColumnType& col_type);
protected:
bool always_true_;
};
#define NUMERIC_MIN_MAX_FILTER(NAME, TYPE) \
class NAME##MinMaxFilter : public MinMaxFilter { \
public: \
NAME##MinMaxFilter() { \
min_ = std::numeric_limits<TYPE>::max(); \
max_ = std::numeric_limits<TYPE>::lowest(); \
} \
NAME##MinMaxFilter(const MinMaxFilterPB& protobuf); \
virtual ~NAME##MinMaxFilter() {} \
virtual const void* GetMin() const override { return &min_; } \
virtual const void* GetMax() const override { return &max_; } \
virtual bool GetCastIntMinMax( \
const ColumnType& type, int64_t* out_min, int64_t* out_max) const override; \
bool EvalOverlap( \
const ColumnType& type, void* data_min, void* data_max) const override; \
float ComputeOverlapRatio( \
const ColumnType& type, void* data_min, void* data_max) override; \
float ComputeOverlapRatio(const ColumnType& type, const TColumnValue& data_min, \
const TColumnValue& data_max) override; \
virtual PrimitiveType type() const override; \
virtual void Insert(const void* val) override; \
virtual void InsertForLE(const void* val) override; \
virtual void InsertForLT(const void* val) override; \
virtual void InsertForGE(const void* val) override; \
virtual void InsertForGT(const void* val) override; \
bool AlwaysTrue() const override; \
virtual bool AlwaysFalse() const override { \
return min_ == std::numeric_limits<TYPE>::max() \
&& max_ == std::numeric_limits<TYPE>::lowest(); \
} \
virtual void ToProtobuf(MinMaxFilterPB* protobuf) const override; \
virtual std::string DebugString() const override; \
static void Or(const MinMaxFilterPB& in, MinMaxFilterPB* out); \
static void Copy(const MinMaxFilterPB& in, MinMaxFilterPB* out); \
static const char* LLVM_CLASS_NAME; \
\
private: \
bool EvalOverlap(const ColumnType& type, void* data_min, void* data_max, \
int64_t* filter_min64, int64_t* filter_max64) const; \
\
private: \
TYPE min_; \
TYPE max_; \
};
NUMERIC_MIN_MAX_FILTER(Bool, bool);
NUMERIC_MIN_MAX_FILTER(TinyInt, int8_t);
NUMERIC_MIN_MAX_FILTER(SmallInt, int16_t);
NUMERIC_MIN_MAX_FILTER(Int, int32_t);
NUMERIC_MIN_MAX_FILTER(BigInt, int64_t);
NUMERIC_MIN_MAX_FILTER(Float, float);
NUMERIC_MIN_MAX_FILTER(Double, double);
int64_t GetIntTypeValue(const ColumnType& type, const void* value);
class StringMinMaxFilter : public MinMaxFilter {
public:
StringMinMaxFilter(MemTracker* mem_tracker)
: MinMaxFilter(),
mem_pool_(mem_tracker),
min_buffer_(&mem_pool_),
max_buffer_(&mem_pool_),
always_false_(true) {}
StringMinMaxFilter(const MinMaxFilterPB& protobuf, MemTracker* mem_tracker);
virtual ~StringMinMaxFilter() {}
static const std::string min_string; // a string of 1 byte of 0x0.
static const std::string max_string; // a string of MAX_BOUND_LENGTH bytes of 0xff.
static const StringValue MIN_BOUND_STRING;
static const StringValue MAX_BOUND_STRING;
virtual void Close() override { mem_pool_.FreeAll(); }
virtual const void* GetMin() const override { return &min_; }
virtual const void* GetMax() const override { return &max_; }
virtual PrimitiveType type() const override;
virtual void Insert(const void* val) override;
// These four version of insert methods materialize the min_ and max_
// by making them pointing at min_buffer/max_buffer_.
virtual void InsertForLE(const void* val) override;
virtual void InsertForLT(const void* val) override;
virtual void InsertForGE(const void* val) override;
virtual void InsertForGT(const void* val) override;
bool AlwaysTrue() const override;
virtual bool AlwaysFalse() const override { return always_false_; }
bool EvalOverlap(
const ColumnType& type, void* data_min, void* data_max) const override;
virtual float ComputeOverlapRatio(
const ColumnType& type, void* data_min, void* data_max) override;
/// Copies the values pointed to by 'min_'/'max_' into 'min_buffer_'/'max_buffer_',
/// truncating them if necessary.
virtual void MaterializeValues() override;
virtual void ToProtobuf(MinMaxFilterPB* protobuf) const override;
virtual std::string DebugString() const override;
static void Or(const MinMaxFilterPB& in, MinMaxFilterPB* out);
static void Copy(const MinMaxFilterPB& in, MinMaxFilterPB* out);
/// Struct name in LLVM IR.
static const char* LLVM_CLASS_NAME;
protected:
virtual void SetAlwaysTrue() override;
// Update min_ and max_ to [value, MAX_BOUND_STRING]
void UpdateMin(const StringValue& value);
// Update min_ and max_ to [MIN_BOUND_STRING, value]
void UpdateMax(const StringValue& value);
// Perform the real work to materialize the min value to min_buffer_;
void MaterializeMinValue();
// Perform the real work to materialize the max value to max_buffer_;
void MaterializeMaxValue();
private:
/// Copies the contents of 'value' into 'buffer', up to 'len', and reassignes 'value' to
/// point to 'buffer', except for small strings which are not modified.
/// If an oom is hit, disables the filter by setting 'always_true_' to true.
void CopyToBuffer(StringBuffer* buffer, StringValue* value, int64_t len);
/// The maximum length of string to store in 'min_str_' or 'max_str_'. Strings inserted
/// into this filter that are longer than this will be truncated.
static const int MAX_BOUND_LENGTH;
/// MemPool that 'min_buffer_'/'max_buffer_' are allocated from.
MemPool mem_pool_;
/// The min/max values. After a call to MaterializeValues() these will point to
/// 'min_buffer_'/'max_buffer_'.
StringValue min_;
StringValue max_;
/// Local buffers to copy min/max data into. If Insert() was called and 'min_'/'max_'
/// was updated, these will be empty until MaterializeValues() is called.
StringBuffer min_buffer_;
StringBuffer max_buffer_;
/// True if no rows have been inserted.
bool always_false_;
};
#define DATE_TIME_MIN_MAX_FILTER(NAME, TYPE) \
class NAME##MinMaxFilter : public MinMaxFilter { \
public: \
NAME##MinMaxFilter() { always_false_ = true; } \
NAME##MinMaxFilter(const MinMaxFilterPB& protobuf); \
virtual ~NAME##MinMaxFilter() {} \
virtual const void* GetMin() const override { return &min_; } \
virtual const void* GetMax() const override { return &max_; } \
virtual PrimitiveType type() const override; \
virtual void Insert(const void* val) override; \
virtual void InsertForLE(const void* val) override; \
virtual void InsertForLT(const void* val) override; \
virtual void InsertForGE(const void* val) override; \
virtual void InsertForGT(const void* val) override; \
bool AlwaysTrue() const override; \
virtual bool AlwaysFalse() const override { return always_false_; } \
bool EvalOverlap( \
const ColumnType& type, void* data_min, void* data_max) const override; \
virtual float ComputeOverlapRatio( \
const ColumnType& type, void* data_min, void* data_max) override; \
virtual float ComputeOverlapRatio(const ColumnType& type, \
const TColumnValue& data_min, const TColumnValue& data_max) override; \
virtual void ToProtobuf(MinMaxFilterPB* protobuf) const override; \
virtual std::string DebugString() const override; \
static void Or(const MinMaxFilterPB& in, MinMaxFilterPB* out); \
static void Copy(const MinMaxFilterPB& in, MinMaxFilterPB* out); \
static const char* LLVM_CLASS_NAME; \
\
private: \
void UpdateMin(const TYPE& val); \
void UpdateMax(const TYPE& val); \
\
private: \
TYPE min_; \
TYPE max_; \
/* True if no rows have been inserted. */ \
bool always_false_; \
};
DATE_TIME_MIN_MAX_FILTER(Timestamp, TimestampValue);
DATE_TIME_MIN_MAX_FILTER(Date, DateValue);
#define DECIMAL_SIZE_4BYTE 4
#define DECIMAL_SIZE_8BYTE 8
#define DECIMAL_SIZE_16BYTE 16
// body of GetMin and GetMax defined below
#pragma push_macro("GET_MINMAX")
#define GET_MINMAX(MIN_OR_MAX) \
do { \
switch (size_) { \
case DECIMAL_SIZE_4BYTE: \
return &(MIN_OR_MAX##4_); \
break; \
case DECIMAL_SIZE_8BYTE: \
return &(MIN_OR_MAX##8_); \
break; \
case DECIMAL_SIZE_16BYTE: \
return &(MIN_OR_MAX##16_); \
break; \
default: \
DCHECK(false) << "Unknown decimal size: " << size_; \
} \
} while (false)
// Decimal data can be stored using 4 bytes, 8 bytes or 16 bytes. The filter
// is for a particular column, and the size needed is fixed based on the
// column specification. So, a union is used to store the min and max value.
// The precision comes in as input. Based on the precision the size is found and
// the respective variable will be used to store the value. The code is
// macro-ized to handle different sized decimals.
class DecimalMinMaxFilter : public MinMaxFilter {
public:
DecimalMinMaxFilter(int precision)
: size_(ColumnType::GetDecimalByteSize(precision)), always_false_(true) {}
DecimalMinMaxFilter(const MinMaxFilterPB& protobuf, int precision);
virtual ~DecimalMinMaxFilter() {}
virtual const void* GetMin() const override {
GET_MINMAX(min);
return nullptr;
}
virtual const void* GetMax() const override {
GET_MINMAX(max);
return nullptr;
}
virtual void Insert(const void* val) override;
virtual void InsertForLE(const void* val) override;
virtual void InsertForLT(const void* val) override;
virtual void InsertForGE(const void* val) override;
virtual void InsertForGT(const void* val) override;
virtual PrimitiveType type() const override;
bool AlwaysTrue() const override;
virtual bool AlwaysFalse() const override { return always_false_; }
virtual void SetAlwaysFalse() { always_false_ = true; }
virtual void ToProtobuf(MinMaxFilterPB* protobuf) const override;
virtual std::string DebugString() const override;
static void Or(const MinMaxFilterPB& in, MinMaxFilterPB* out, int precision);
static void Copy(const MinMaxFilterPB& in, MinMaxFilterPB* out);
bool EvalOverlap(
const ColumnType& type, void* data_min, void* data_max) const override;
virtual float ComputeOverlapRatio(
const ColumnType& type, void* data_min, void* data_max) override;
virtual float ComputeOverlapRatio(const ColumnType& type, const TColumnValue& data_min,
const TColumnValue& data_max) override;
void Insert4(const void* val);
void Insert8(const void* val);
void Insert16(const void* val);
void Insert4ForLE(const void* val);
void Insert8ForLE(const void* val);
void Insert16ForLE(const void* val);
void Insert4ForGE(const void* val);
void Insert8ForGE(const void* val);
void Insert16ForGE(const void* val);
void Insert4ForLT(const void* val);
void Insert8ForLT(const void* val);
void Insert16ForLT(const void* val);
void Insert4ForGT(const void* val);
void Insert8ForGT(const void* val);
void Insert16ForGT(const void* val);
void UpdateMin(const Decimal4Value&);
void UpdateMin(const Decimal8Value&);
void UpdateMin(const Decimal16Value&);
void UpdateMax(const Decimal4Value&);
void UpdateMax(const Decimal8Value&);
void UpdateMax(const Decimal16Value&);
/// Struct name in LLVM IR.
static const char* LLVM_CLASS_NAME;
private:
const int size_;
union {
Decimal16Value min16_;
Decimal8Value min8_;
Decimal4Value min4_;
};
union {
Decimal16Value max16_;
Decimal8Value max8_;
Decimal4Value max4_;
};
/// True if no rows have been inserted.
bool always_false_;
};
#pragma pop_macro("GET_MINMAX")
}
#endif