blob: 28c182a7703b89a7bb227176fcc8fc3b630c204c [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include "kudu/common/column_predicate.h"
#include <algorithm>
#include <utility>
#include "kudu/common/key_util.h"
#include "kudu/common/rowblock.h"
#include "kudu/common/schema.h"
#include "kudu/common/types.h"
#include "kudu/util/memory/arena.h"
using std::move;
using std::vector;
namespace kudu {
ColumnPredicate::ColumnPredicate(PredicateType predicate_type,
ColumnSchema column,
const void* lower,
const void* upper)
: predicate_type_(predicate_type),
column_(move(column)),
lower_(lower),
upper_(upper) {
}
ColumnPredicate::ColumnPredicate(PredicateType predicate_type,
ColumnSchema column,
vector<const void*>* values)
: predicate_type_(predicate_type),
column_(move(column)),
lower_(nullptr),
upper_(nullptr) {
values_.swap(*values);
}
ColumnPredicate ColumnPredicate::Equality(ColumnSchema column, const void* value) {
CHECK(value != nullptr);
return ColumnPredicate(PredicateType::Equality, move(column), value, nullptr);
}
ColumnPredicate ColumnPredicate::Range(ColumnSchema column,
const void* lower,
const void* upper) {
CHECK(lower != nullptr || upper != nullptr);
ColumnPredicate pred(PredicateType::Range, move(column), lower, upper);
pred.Simplify();
return pred;
}
ColumnPredicate ColumnPredicate::InList(ColumnSchema column,
vector<const void*>* values) {
CHECK(values != nullptr);
// Sort values and remove duplicates.
std::sort(values->begin(), values->end(),
[&] (const void* a, const void* b) {
return column.type_info()->Compare(a, b) < 0;
});
values->erase(std::unique(values->begin(), values->end(),
[&] (const void* a, const void* b) {
return column.type_info()->Compare(a, b) == 0;
}),
values->end());
ColumnPredicate pred(PredicateType::InList, move(column), values);
pred.Simplify();
return pred;
}
boost::optional<ColumnPredicate> ColumnPredicate::InclusiveRange(ColumnSchema column,
const void* lower,
const void* upper,
Arena* arena) {
CHECK(lower != nullptr || upper != nullptr);
if (upper != nullptr) {
// Transform the upper bound to exclusive by incrementing it.
// Make a copy of the value before incrementing in case it's aliased.
size_t size = column.type_info()->size();
void* buf = CHECK_NOTNULL(arena->AllocateBytes(size));
memcpy(buf, upper, size);
if (!key_util::IncrementCell(column, buf, arena)) {
if (lower == nullptr) {
if (column.is_nullable()) {
// If incrementing the upper bound fails and the column is nullable,
// then return an IS NOT NULL predicate, so that null values will be
// filtered.
return ColumnPredicate::IsNotNull(move(column));
} else {
return boost::none;
}
} else {
upper = nullptr;
}
} else {
upper = buf;
}
}
return ColumnPredicate::Range(move(column), lower, upper);
}
ColumnPredicate ColumnPredicate::ExclusiveRange(ColumnSchema column,
const void* lower,
const void* upper,
Arena* arena) {
CHECK(lower != nullptr || upper != nullptr);
if (lower != nullptr) {
// Transform the lower bound to inclusive by incrementing it.
// Make a copy of the value before incrementing in case it's aliased.
size_t size = column.type_info()->size();
void* buf = CHECK_NOTNULL(arena->AllocateBytes(size));
memcpy(buf, lower, size);
if (!key_util::IncrementCell(column, buf, arena)) {
// If incrementing the lower bound fails then the predicate can match no values.
return ColumnPredicate::None(move(column));
} else {
lower = buf;
}
}
return ColumnPredicate::Range(move(column), lower, upper);
}
ColumnPredicate ColumnPredicate::IsNotNull(ColumnSchema column) {
return ColumnPredicate(PredicateType::IsNotNull, move(column), nullptr, nullptr);
}
ColumnPredicate ColumnPredicate::None(ColumnSchema column) {
return ColumnPredicate(PredicateType::None, move(column), nullptr, nullptr);
}
void ColumnPredicate::SetToNone() {
predicate_type_ = PredicateType::None;
lower_ = nullptr;
upper_ = nullptr;
}
void ColumnPredicate::Simplify() {
// TODO(dan): we are missing some simplification opportunities here:
// * range predicates including the entire range of a bool/integer
// (`my_int8 >= -127`) can be simplified to `IS NOT NULL`.
// * `IN` predicates including all values of a bool/integer
// (`my_bool IN (true, false)`) can be simplified to `IS NOT NULL`.
switch (predicate_type_) {
case PredicateType::None:
case PredicateType::Equality:
case PredicateType::IsNotNull: return;
case PredicateType::Range: {
if (lower_ != nullptr && upper_ != nullptr) {
if (column_.type_info()->Compare(lower_, upper_) >= 0) {
// If the range bounds are empty then no results can be returned.
SetToNone();
} else if (column_.type_info()->AreConsecutive(lower_, upper_)) {
// If the values are consecutive, then it is an equality bound.
predicate_type_ = PredicateType::Equality;
upper_ = nullptr;
}
}
return;
};
case PredicateType::InList: {
if (values_.empty()) {
// If the list is empty, no results can be returned.
SetToNone();
} else if (values_.size() == 1) {
// List has only one value, so convert to Equality
predicate_type_ = PredicateType::Equality;
lower_ = values_[0];
values_.clear();
} else if (column_.type_info()->type() == BOOL) {
// If this is a boolean IN list with both true and false in the list,
// then we can just convert it to IS NOT NULL. This same simplification
// could be done for other integer types, but it's probably not as
// common (and hard to test).
predicate_type_ = PredicateType::IsNotNull;
lower_ = nullptr;
upper_ = nullptr;
values_.clear();
}
return;
};
}
LOG(FATAL) << "unknown predicate type";
}
void ColumnPredicate::Merge(const ColumnPredicate& other) {
CHECK(column_.Equals(other.column_, false));
switch (predicate_type_) {
case PredicateType::None: return;
case PredicateType::Range: {
MergeIntoRange(other);
return;
};
case PredicateType::Equality: {
MergeIntoEquality(other);
return;
};
case PredicateType::IsNotNull: {
// NOT NULL is less selective than all other predicate types, so the
// intersection of NOT NULL with any other predicate is just the other
// predicate.
//
// Note: this will no longer be true when an IS NULL predicate type is
// added.
predicate_type_ = other.predicate_type_;
lower_ = other.lower_;
upper_ = other.upper_;
values_ = other.values_;
return;
};
case PredicateType::InList: {
MergeIntoInList(other);
return;
};
}
LOG(FATAL) << "unknown predicate type";
}
void ColumnPredicate::MergeIntoRange(const ColumnPredicate& other) {
CHECK(predicate_type_ == PredicateType::Range);
switch (other.predicate_type()) {
case PredicateType::None: {
SetToNone();
return;
};
case PredicateType::Range: {
// Set the lower bound to the larger of the two.
if (other.lower_ != nullptr &&
(lower_ == nullptr || column_.type_info()->Compare(lower_, other.lower_) < 0)) {
lower_ = other.lower_;
}
// Set the upper bound to the smaller of the two.
if (other.upper_ != nullptr &&
(upper_ == nullptr || column_.type_info()->Compare(upper_, other.upper_) > 0)) {
upper_ = other.upper_;
}
Simplify();
return;
};
case PredicateType::Equality: {
if ((lower_ != nullptr && column_.type_info()->Compare(lower_, other.lower_) > 0) ||
(upper_ != nullptr && column_.type_info()->Compare(upper_, other.lower_) <= 0)) {
// The equality value does not fall in this range.
SetToNone();
} else {
predicate_type_ = PredicateType::Equality;
lower_ = other.lower_;
upper_ = nullptr;
}
return;
};
case PredicateType::IsNotNull: return;
case PredicateType::InList : {
// The InList predicate values are examined to check whether
// they lie in the range.
// The current predicate is then converted from a range predicate to
// an InList predicate (since it is more selective).
// The number of values in the InList will depend on how many
// values were within the range.
// The call to Simplify() will then convert the InList if appropriate:
// i.e an InList with zero entries gets converted to a NONE
// and InList with 1 entry gets converted into an Equality.
values_ = other.values_;
values_.erase(std::remove_if(values_.begin(), values_.end(),
[this] (const void* v) {
return !CheckValueInRange(v);
}), values_.end());
predicate_type_ = PredicateType::InList;
Simplify();
return;
};
}
LOG(FATAL) << "unknown predicate type";
}
void ColumnPredicate::MergeIntoEquality(const ColumnPredicate& other) {
CHECK(predicate_type_ == PredicateType::Equality);
switch (other.predicate_type()) {
case PredicateType::None: {
SetToNone();
return;
}
case PredicateType::Range: {
if (!other.CheckValueInRange(lower_)) {
// This equality value does not fall in the other range.
SetToNone();
}
return;
};
case PredicateType::Equality: {
if (column_.type_info()->Compare(lower_, other.lower_) != 0) {
SetToNone();
}
return;
};
case PredicateType::IsNotNull: return;
case PredicateType::InList : {
// The equality value needs to be a member of the InList
if (!other.CheckValueInList(lower_)) {
SetToNone();
}
return;
};
}
LOG(FATAL) << "unknown predicate type";
}
void ColumnPredicate::MergeIntoInList(const ColumnPredicate &other) {
CHECK(predicate_type_ == PredicateType::InList);
DCHECK(values_.size() > 1);
switch (other.predicate_type()) {
case PredicateType::None : {
SetToNone();
return;
};
case PredicateType::Range: {
// Only values within the range should be retained.
auto search_by = [&] (const void* lhs, const void* rhs) {
return this->column_.type_info()->Compare(lhs, rhs) < 0;
};
// Remove all values greater than the range.
if (other.upper_ != nullptr) {
// lower_bound is used here instead of upper_bound, since the upper
// bound of the range is exclusive, and the in list is inclusive.
auto upper = std::lower_bound(values_.begin(), values_.end(), other.upper_, search_by);
values_.erase(upper, values_.end());
}
// Remove all values less than the range.
if (other.lower_ != nullptr) {
auto lower = std::lower_bound(values_.begin(), values_.end(), other.lower_, search_by);
values_.erase(values_.begin(), lower);
}
Simplify();
return;
}
case PredicateType::Equality: {
if (CheckValueInList(other.lower_)) {
// value falls in list so change to Equality predicate
predicate_type_ = PredicateType::Equality;
lower_ = other.lower_;
upper_ = nullptr;
} else {
SetToNone(); // Value does not fall in list
}
return;
}
case PredicateType::IsNotNull: return;
case PredicateType::InList: {
// Merge the 'other' IN list into this IN list. The basic idea is to loop
// through this predicate list, retaining only the values which are also
// contained in the other predicate list. We apply an optimization first:
// all values from this in list which fall outside the range of the other
// IN list are removed, and the remaining values in this IN list are only
// checked against values in the other list which are in this list's
// range. This doesn't reduce the worst-case complexity, but can really
// speed up the merge for big lists in certain cases. This optimization
// relies on the lists being sorted.
DCHECK(other.values_.size() > 1);
auto search_by = [&] (const void* lhs, const void* rhs) {
return this->column_.type_info()->Compare(lhs, rhs) < 0;
};
// Remove all values in this IN list which are greater than the largest
// value in the other list.
values_.erase(std::upper_bound(values_.begin(), values_.end(),
other.values_.back(), search_by),
values_.end());
// Remove all values in this IN list which are less than the smallest
// value in the other list.
values_.erase(values_.begin(),
std::lower_bound(values_.begin(), values_.end(),
other.values_.front(), search_by));
if (values_.empty()) {
SetToNone();
return;
}
// Find the sublist in the other IN list which overlaps with this IN list.
auto other_start = std::lower_bound(other.values_.begin(), other.values_.end(),
values_.front(), search_by);
auto other_end = std::upper_bound(other_start, other.values_.end(),
values_.back(), search_by);
// Returns true if the value is *not* present in the other list.
// Modifies other_start to point at the position of v in the other list.
auto other_absent = [&] (const void* v) {
other_start = std::lower_bound(other_start, other_end, v, search_by);
return this->column_.type_info()->Compare(v, *other_start) != 0;
};
// Iterate through the values_ list and remove elements that do not exist
// in the other list.
values_.erase(std::remove_if(values_.begin(), values_.end(), other_absent), values_.end());
Simplify();
return;
};
}
LOG(FATAL) << "unknown predicate type";
}
namespace {
template <typename P>
void ApplyPredicate(const ColumnBlock& block, SelectionVector* sel, P p) {
if (block.is_nullable()) {
for (size_t i = 0; i < block.nrows(); i++) {
if (!sel->IsRowSelected(i)) continue;
const void* cell = block.nullable_cell_ptr(i);
if (cell == nullptr || !p(cell)) {
BitmapClear(sel->mutable_bitmap(), i);
}
}
} else {
for (size_t i = 0; i < block.nrows(); i++) {
if (!sel->IsRowSelected(i)) continue;
const void* cell = block.cell_ptr(i);
if (!p(cell)) {
BitmapClear(sel->mutable_bitmap(), i);
}
}
}
}
} // anonymous namespace
template <DataType PhysicalType>
void ColumnPredicate::EvaluateForPhysicalType(const ColumnBlock& block,
SelectionVector* sel) const {
switch (predicate_type()) {
case PredicateType::Range: {
if (lower_ == nullptr) {
ApplyPredicate(block, sel, [this] (const void* cell) {
return DataTypeTraits<PhysicalType>::Compare(cell, this->upper_) < 0;
});
} else if (upper_ == nullptr) {
ApplyPredicate(block, sel, [this] (const void* cell) {
return DataTypeTraits<PhysicalType>::Compare(cell, this->lower_) >= 0;
});
} else {
ApplyPredicate(block, sel, [this] (const void* cell) {
return DataTypeTraits<PhysicalType>::Compare(cell, this->upper_) < 0 &&
DataTypeTraits<PhysicalType>::Compare(cell, this->lower_) >= 0;
});
}
return;
};
case PredicateType::Equality: {
ApplyPredicate(block, sel, [this] (const void* cell) {
return DataTypeTraits<PhysicalType>::Compare(cell, this->lower_) == 0;
});
return;
};
case PredicateType::IsNotNull: {
if (!block.is_nullable()) return;
// TODO: make this more efficient by using bitwise operations on the
// null and selection vectors.
for (size_t i = 0; i < block.nrows(); i++) {
if (sel->IsRowSelected(i) && block.is_null(i)) {
BitmapClear(sel->mutable_bitmap(), i);
}
}
return;
};
case PredicateType::InList: {
ApplyPredicate(block, sel, [this] (const void* cell) {
return std::binary_search(values_.begin(), values_.end(), cell,
[] (const void* lhs, const void* rhs) {
return DataTypeTraits<PhysicalType>::Compare(lhs, rhs) < 0;
});
});
return;
};
case PredicateType::None: LOG(FATAL) << "NONE predicate evaluation";
}
LOG(FATAL) << "unknown predicate type";
}
void ColumnPredicate::Evaluate(const ColumnBlock& block, SelectionVector* sel) const {
DCHECK(sel);
switch (block.type_info()->physical_type()) {
case BOOL: return EvaluateForPhysicalType<BOOL>(block, sel);
case INT8: return EvaluateForPhysicalType<INT8>(block, sel);
case INT16: return EvaluateForPhysicalType<INT16>(block, sel);
case INT32: return EvaluateForPhysicalType<INT32>(block, sel);
case INT64: return EvaluateForPhysicalType<INT64>(block, sel);
case UINT8: return EvaluateForPhysicalType<UINT8>(block, sel);
case UINT16: return EvaluateForPhysicalType<UINT16>(block, sel);
case UINT32: return EvaluateForPhysicalType<UINT32>(block, sel);
case UINT64: return EvaluateForPhysicalType<UINT64>(block, sel);
case FLOAT: return EvaluateForPhysicalType<FLOAT>(block, sel);
case DOUBLE: return EvaluateForPhysicalType<DOUBLE>(block, sel);
case BINARY: return EvaluateForPhysicalType<BINARY>(block, sel);
default: LOG(FATAL) << "unknown physical type: " << block.type_info()->physical_type();
}
}
string ColumnPredicate::ToString() const {
switch (predicate_type()) {
case PredicateType::None: return strings::Substitute("`$0` NONE", column_.name());
case PredicateType::Range: {
if (lower_ == nullptr) {
return strings::Substitute("`$0` < $1", column_.name(), column_.Stringify(upper_));
} else if (upper_ == nullptr) {
return strings::Substitute("`$0` >= $1", column_.name(), column_.Stringify(lower_));
} else {
return strings::Substitute("`$0` >= $1 AND `$0` < $2",
column_.name(),
column_.Stringify(lower_),
column_.Stringify(upper_));
}
};
case PredicateType::Equality: {
return strings::Substitute("`$0` = $1", column_.name(), column_.Stringify(lower_));
};
case PredicateType::IsNotNull: {
return strings::Substitute("`$0` IS NOT NULL", column_.name());
};
case PredicateType::InList: {
string ss = "`";
ss.append(column_.name());
ss.append("` IN (");
bool is_first = true;
for (auto* value : values_) {
if (is_first) {
is_first = false;
} else {
ss.append(", ");
}
ss.append(column_.Stringify(value));
}
ss.append(")");
return ss;
};
}
LOG(FATAL) << "unknown predicate type";
}
bool ColumnPredicate::operator==(const ColumnPredicate& other) const {
if (!column_.Equals(other.column_, false)) { return false; }
if (predicate_type_ != other.predicate_type_) {
return false;
}
switch (predicate_type_) {
case PredicateType::Equality: return column_.type_info()->Compare(lower_, other.lower_) == 0;
case PredicateType::Range: {
return (lower_ == other.lower_ ||
(lower_ != nullptr && other.lower_ != nullptr &&
column_.type_info()->Compare(lower_, other.lower_) == 0)) &&
(upper_ == other.upper_ ||
(upper_ != nullptr && other.upper_ != nullptr &&
column_.type_info()->Compare(upper_, other.upper_) == 0));
};
case PredicateType::InList: {
if (values_.size() != other.values_.size()) return false;
for (int i = 0; i < values_.size(); i++) {
if (column_.type_info()->Compare(values_[i], other.values_[i]) != 0) return false;
}
return true;
};
case PredicateType::None:
case PredicateType::IsNotNull: return true;
}
LOG(FATAL) << "unknown predicate type";
}
bool ColumnPredicate::CheckValueInRange(const void* value) const {
CHECK(predicate_type_ == PredicateType::Range);
return ((lower_ == nullptr || column_.type_info()->Compare(lower_, value) <= 0) &&
(upper_ == nullptr || column_.type_info()->Compare(upper_, value) > 0));
}
bool ColumnPredicate::CheckValueInList(const void* value) const {
return std::binary_search(values_.begin(), values_.end(), value,
[this](const void* lhs, const void* rhs) {
return this->column_.type_info()->Compare(lhs, rhs) < 0;
});
}
namespace {
int SelectivityRank(const ColumnPredicate& predicate) {
int rank;
switch (predicate.predicate_type()) {
case PredicateType::None: rank = 0; break;
case PredicateType::Equality: rank = 1; break;
case PredicateType::InList: rank = 2; break;
case PredicateType::Range: rank = 3; break;
case PredicateType::IsNotNull: rank = 4; break;
default: LOG(FATAL) << "unknown predicate type";
}
return rank * (kLargestTypeSize + 1) + predicate.column().type_info()->size();
}
} // anonymous namespace
int SelectivityComparator(const ColumnPredicate& left, const ColumnPredicate& right) {
return SelectivityRank(left) - SelectivityRank(right);
}
} // namespace kudu