| // Licensed to the Apache Software Foundation (ASF) under one |
| // or more contributor license agreements. See the NOTICE file |
| // distributed with this work for additional information |
| // regarding copyright ownership. The ASF licenses this file |
| // to you under the Apache License, Version 2.0 (the |
| // "License"); you may not use this file except in compliance |
| // with the License. You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, |
| // software distributed under the License is distributed on an |
| // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| // KIND, either express or implied. See the License for the |
| // specific language governing permissions and limitations |
| // under the License. |
| |
| #include "kudu/common/column_predicate.h" |
| |
| #include <algorithm> |
| #include <utility> |
| |
| #include "kudu/common/key_util.h" |
| #include "kudu/common/rowblock.h" |
| #include "kudu/common/schema.h" |
| #include "kudu/common/types.h" |
| #include "kudu/util/memory/arena.h" |
| |
| using std::move; |
| using std::vector; |
| |
| namespace kudu { |
| |
| ColumnPredicate::ColumnPredicate(PredicateType predicate_type, |
| ColumnSchema column, |
| const void* lower, |
| const void* upper) |
| : predicate_type_(predicate_type), |
| column_(move(column)), |
| lower_(lower), |
| upper_(upper) { |
| } |
| |
| ColumnPredicate::ColumnPredicate(PredicateType predicate_type, |
| ColumnSchema column, |
| vector<const void*>* values) |
| : predicate_type_(predicate_type), |
| column_(move(column)), |
| lower_(nullptr), |
| upper_(nullptr) { |
| values_.swap(*values); |
| } |
| |
| ColumnPredicate ColumnPredicate::Equality(ColumnSchema column, const void* value) { |
| CHECK(value != nullptr); |
| return ColumnPredicate(PredicateType::Equality, move(column), value, nullptr); |
| } |
| |
| ColumnPredicate ColumnPredicate::Range(ColumnSchema column, |
| const void* lower, |
| const void* upper) { |
| CHECK(lower != nullptr || upper != nullptr); |
| ColumnPredicate pred(PredicateType::Range, move(column), lower, upper); |
| pred.Simplify(); |
| return pred; |
| } |
| |
| ColumnPredicate ColumnPredicate::InList(ColumnSchema column, |
| vector<const void*>* values) { |
| CHECK(values != nullptr); |
| |
| // Sort values and remove duplicates. |
| std::sort(values->begin(), values->end(), |
| [&] (const void* a, const void* b) { |
| return column.type_info()->Compare(a, b) < 0; |
| }); |
| values->erase(std::unique(values->begin(), values->end(), |
| [&] (const void* a, const void* b) { |
| return column.type_info()->Compare(a, b) == 0; |
| }), |
| values->end()); |
| |
| ColumnPredicate pred(PredicateType::InList, move(column), values); |
| pred.Simplify(); |
| return pred; |
| } |
| |
| boost::optional<ColumnPredicate> ColumnPredicate::InclusiveRange(ColumnSchema column, |
| const void* lower, |
| const void* upper, |
| Arena* arena) { |
| CHECK(lower != nullptr || upper != nullptr); |
| |
| if (upper != nullptr) { |
| // Transform the upper bound to exclusive by incrementing it. |
| // Make a copy of the value before incrementing in case it's aliased. |
| size_t size = column.type_info()->size(); |
| void* buf = CHECK_NOTNULL(arena->AllocateBytes(size)); |
| memcpy(buf, upper, size); |
| if (!key_util::IncrementCell(column, buf, arena)) { |
| if (lower == nullptr) { |
| if (column.is_nullable()) { |
| // If incrementing the upper bound fails and the column is nullable, |
| // then return an IS NOT NULL predicate, so that null values will be |
| // filtered. |
| return ColumnPredicate::IsNotNull(move(column)); |
| } else { |
| return boost::none; |
| } |
| } else { |
| upper = nullptr; |
| } |
| } else { |
| upper = buf; |
| } |
| } |
| return ColumnPredicate::Range(move(column), lower, upper); |
| } |
| |
| ColumnPredicate ColumnPredicate::ExclusiveRange(ColumnSchema column, |
| const void* lower, |
| const void* upper, |
| Arena* arena) { |
| CHECK(lower != nullptr || upper != nullptr); |
| |
| if (lower != nullptr) { |
| // Transform the lower bound to inclusive by incrementing it. |
| // Make a copy of the value before incrementing in case it's aliased. |
| size_t size = column.type_info()->size(); |
| void* buf = CHECK_NOTNULL(arena->AllocateBytes(size)); |
| memcpy(buf, lower, size); |
| if (!key_util::IncrementCell(column, buf, arena)) { |
| // If incrementing the lower bound fails then the predicate can match no values. |
| return ColumnPredicate::None(move(column)); |
| } else { |
| lower = buf; |
| } |
| } |
| return ColumnPredicate::Range(move(column), lower, upper); |
| } |
| |
| ColumnPredicate ColumnPredicate::IsNotNull(ColumnSchema column) { |
| return ColumnPredicate(PredicateType::IsNotNull, move(column), nullptr, nullptr); |
| } |
| |
| ColumnPredicate ColumnPredicate::None(ColumnSchema column) { |
| return ColumnPredicate(PredicateType::None, move(column), nullptr, nullptr); |
| } |
| |
| void ColumnPredicate::SetToNone() { |
| predicate_type_ = PredicateType::None; |
| lower_ = nullptr; |
| upper_ = nullptr; |
| } |
| |
| void ColumnPredicate::Simplify() { |
| // TODO(dan): we are missing some simplification opportunities here: |
| // * range predicates including the entire range of a bool/integer |
| // (`my_int8 >= -127`) can be simplified to `IS NOT NULL`. |
| // * `IN` predicates including all values of a bool/integer |
| // (`my_bool IN (true, false)`) can be simplified to `IS NOT NULL`. |
| switch (predicate_type_) { |
| case PredicateType::None: |
| case PredicateType::Equality: |
| case PredicateType::IsNotNull: return; |
| case PredicateType::Range: { |
| if (lower_ != nullptr && upper_ != nullptr) { |
| if (column_.type_info()->Compare(lower_, upper_) >= 0) { |
| // If the range bounds are empty then no results can be returned. |
| SetToNone(); |
| } else if (column_.type_info()->AreConsecutive(lower_, upper_)) { |
| // If the values are consecutive, then it is an equality bound. |
| predicate_type_ = PredicateType::Equality; |
| upper_ = nullptr; |
| } |
| } |
| return; |
| }; |
| case PredicateType::InList: { |
| if (values_.empty()) { |
| // If the list is empty, no results can be returned. |
| SetToNone(); |
| } else if (values_.size() == 1) { |
| // List has only one value, so convert to Equality |
| predicate_type_ = PredicateType::Equality; |
| lower_ = values_[0]; |
| values_.clear(); |
| } else if (column_.type_info()->type() == BOOL) { |
| // If this is a boolean IN list with both true and false in the list, |
| // then we can just convert it to IS NOT NULL. This same simplification |
| // could be done for other integer types, but it's probably not as |
| // common (and hard to test). |
| predicate_type_ = PredicateType::IsNotNull; |
| lower_ = nullptr; |
| upper_ = nullptr; |
| values_.clear(); |
| } |
| return; |
| }; |
| } |
| LOG(FATAL) << "unknown predicate type"; |
| } |
| |
| void ColumnPredicate::Merge(const ColumnPredicate& other) { |
| CHECK(column_.Equals(other.column_, false)); |
| switch (predicate_type_) { |
| case PredicateType::None: return; |
| case PredicateType::Range: { |
| MergeIntoRange(other); |
| return; |
| }; |
| case PredicateType::Equality: { |
| MergeIntoEquality(other); |
| return; |
| }; |
| case PredicateType::IsNotNull: { |
| // NOT NULL is less selective than all other predicate types, so the |
| // intersection of NOT NULL with any other predicate is just the other |
| // predicate. |
| // |
| // Note: this will no longer be true when an IS NULL predicate type is |
| // added. |
| predicate_type_ = other.predicate_type_; |
| lower_ = other.lower_; |
| upper_ = other.upper_; |
| values_ = other.values_; |
| return; |
| }; |
| case PredicateType::InList: { |
| MergeIntoInList(other); |
| return; |
| }; |
| } |
| LOG(FATAL) << "unknown predicate type"; |
| } |
| |
| void ColumnPredicate::MergeIntoRange(const ColumnPredicate& other) { |
| CHECK(predicate_type_ == PredicateType::Range); |
| |
| switch (other.predicate_type()) { |
| case PredicateType::None: { |
| SetToNone(); |
| return; |
| }; |
| |
| case PredicateType::Range: { |
| // Set the lower bound to the larger of the two. |
| if (other.lower_ != nullptr && |
| (lower_ == nullptr || column_.type_info()->Compare(lower_, other.lower_) < 0)) { |
| lower_ = other.lower_; |
| } |
| |
| // Set the upper bound to the smaller of the two. |
| if (other.upper_ != nullptr && |
| (upper_ == nullptr || column_.type_info()->Compare(upper_, other.upper_) > 0)) { |
| upper_ = other.upper_; |
| } |
| |
| Simplify(); |
| return; |
| }; |
| |
| case PredicateType::Equality: { |
| if ((lower_ != nullptr && column_.type_info()->Compare(lower_, other.lower_) > 0) || |
| (upper_ != nullptr && column_.type_info()->Compare(upper_, other.lower_) <= 0)) { |
| // The equality value does not fall in this range. |
| SetToNone(); |
| } else { |
| predicate_type_ = PredicateType::Equality; |
| lower_ = other.lower_; |
| upper_ = nullptr; |
| } |
| return; |
| }; |
| case PredicateType::IsNotNull: return; |
| case PredicateType::InList : { |
| // The InList predicate values are examined to check whether |
| // they lie in the range. |
| // The current predicate is then converted from a range predicate to |
| // an InList predicate (since it is more selective). |
| // The number of values in the InList will depend on how many |
| // values were within the range. |
| // The call to Simplify() will then convert the InList if appropriate: |
| // i.e an InList with zero entries gets converted to a NONE |
| // and InList with 1 entry gets converted into an Equality. |
| values_ = other.values_; |
| values_.erase(std::remove_if(values_.begin(), values_.end(), |
| [this] (const void* v) { |
| return !CheckValueInRange(v); |
| }), values_.end()); |
| predicate_type_ = PredicateType::InList; |
| Simplify(); |
| return; |
| }; |
| } |
| LOG(FATAL) << "unknown predicate type"; |
| } |
| |
| void ColumnPredicate::MergeIntoEquality(const ColumnPredicate& other) { |
| CHECK(predicate_type_ == PredicateType::Equality); |
| |
| switch (other.predicate_type()) { |
| case PredicateType::None: { |
| SetToNone(); |
| return; |
| } |
| case PredicateType::Range: { |
| if (!other.CheckValueInRange(lower_)) { |
| // This equality value does not fall in the other range. |
| SetToNone(); |
| } |
| return; |
| }; |
| case PredicateType::Equality: { |
| if (column_.type_info()->Compare(lower_, other.lower_) != 0) { |
| SetToNone(); |
| } |
| return; |
| }; |
| case PredicateType::IsNotNull: return; |
| case PredicateType::InList : { |
| // The equality value needs to be a member of the InList |
| if (!other.CheckValueInList(lower_)) { |
| SetToNone(); |
| } |
| return; |
| }; |
| } |
| LOG(FATAL) << "unknown predicate type"; |
| } |
| |
| void ColumnPredicate::MergeIntoInList(const ColumnPredicate &other) { |
| CHECK(predicate_type_ == PredicateType::InList); |
| DCHECK(values_.size() > 1); |
| |
| switch (other.predicate_type()) { |
| case PredicateType::None : { |
| SetToNone(); |
| return; |
| }; |
| case PredicateType::Range: { |
| // Only values within the range should be retained. |
| auto search_by = [&] (const void* lhs, const void* rhs) { |
| return this->column_.type_info()->Compare(lhs, rhs) < 0; |
| }; |
| |
| // Remove all values greater than the range. |
| if (other.upper_ != nullptr) { |
| // lower_bound is used here instead of upper_bound, since the upper |
| // bound of the range is exclusive, and the in list is inclusive. |
| auto upper = std::lower_bound(values_.begin(), values_.end(), other.upper_, search_by); |
| values_.erase(upper, values_.end()); |
| } |
| |
| // Remove all values less than the range. |
| if (other.lower_ != nullptr) { |
| auto lower = std::lower_bound(values_.begin(), values_.end(), other.lower_, search_by); |
| values_.erase(values_.begin(), lower); |
| } |
| |
| Simplify(); |
| return; |
| } |
| case PredicateType::Equality: { |
| if (CheckValueInList(other.lower_)) { |
| // value falls in list so change to Equality predicate |
| predicate_type_ = PredicateType::Equality; |
| lower_ = other.lower_; |
| upper_ = nullptr; |
| } else { |
| SetToNone(); // Value does not fall in list |
| } |
| return; |
| } |
| case PredicateType::IsNotNull: return; |
| case PredicateType::InList: { |
| // Merge the 'other' IN list into this IN list. The basic idea is to loop |
| // through this predicate list, retaining only the values which are also |
| // contained in the other predicate list. We apply an optimization first: |
| // all values from this in list which fall outside the range of the other |
| // IN list are removed, and the remaining values in this IN list are only |
| // checked against values in the other list which are in this list's |
| // range. This doesn't reduce the worst-case complexity, but can really |
| // speed up the merge for big lists in certain cases. This optimization |
| // relies on the lists being sorted. |
| DCHECK(other.values_.size() > 1); |
| |
| auto search_by = [&] (const void* lhs, const void* rhs) { |
| return this->column_.type_info()->Compare(lhs, rhs) < 0; |
| }; |
| |
| // Remove all values in this IN list which are greater than the largest |
| // value in the other list. |
| values_.erase(std::upper_bound(values_.begin(), values_.end(), |
| other.values_.back(), search_by), |
| values_.end()); |
| |
| // Remove all values in this IN list which are less than the smallest |
| // value in the other list. |
| values_.erase(values_.begin(), |
| std::lower_bound(values_.begin(), values_.end(), |
| other.values_.front(), search_by)); |
| |
| if (values_.empty()) { |
| SetToNone(); |
| return; |
| } |
| |
| // Find the sublist in the other IN list which overlaps with this IN list. |
| auto other_start = std::lower_bound(other.values_.begin(), other.values_.end(), |
| values_.front(), search_by); |
| auto other_end = std::upper_bound(other_start, other.values_.end(), |
| values_.back(), search_by); |
| |
| // Returns true if the value is *not* present in the other list. |
| // Modifies other_start to point at the position of v in the other list. |
| auto other_absent = [&] (const void* v) { |
| other_start = std::lower_bound(other_start, other_end, v, search_by); |
| return this->column_.type_info()->Compare(v, *other_start) != 0; |
| }; |
| |
| // Iterate through the values_ list and remove elements that do not exist |
| // in the other list. |
| values_.erase(std::remove_if(values_.begin(), values_.end(), other_absent), values_.end()); |
| Simplify(); |
| return; |
| }; |
| } |
| LOG(FATAL) << "unknown predicate type"; |
| } |
| |
| namespace { |
| template <typename P> |
| void ApplyPredicate(const ColumnBlock& block, SelectionVector* sel, P p) { |
| if (block.is_nullable()) { |
| for (size_t i = 0; i < block.nrows(); i++) { |
| if (!sel->IsRowSelected(i)) continue; |
| const void* cell = block.nullable_cell_ptr(i); |
| if (cell == nullptr || !p(cell)) { |
| BitmapClear(sel->mutable_bitmap(), i); |
| } |
| } |
| } else { |
| for (size_t i = 0; i < block.nrows(); i++) { |
| if (!sel->IsRowSelected(i)) continue; |
| const void* cell = block.cell_ptr(i); |
| if (!p(cell)) { |
| BitmapClear(sel->mutable_bitmap(), i); |
| } |
| } |
| } |
| } |
| } // anonymous namespace |
| |
| template <DataType PhysicalType> |
| void ColumnPredicate::EvaluateForPhysicalType(const ColumnBlock& block, |
| SelectionVector* sel) const { |
| switch (predicate_type()) { |
| case PredicateType::Range: { |
| if (lower_ == nullptr) { |
| ApplyPredicate(block, sel, [this] (const void* cell) { |
| return DataTypeTraits<PhysicalType>::Compare(cell, this->upper_) < 0; |
| }); |
| } else if (upper_ == nullptr) { |
| ApplyPredicate(block, sel, [this] (const void* cell) { |
| return DataTypeTraits<PhysicalType>::Compare(cell, this->lower_) >= 0; |
| }); |
| } else { |
| ApplyPredicate(block, sel, [this] (const void* cell) { |
| return DataTypeTraits<PhysicalType>::Compare(cell, this->upper_) < 0 && |
| DataTypeTraits<PhysicalType>::Compare(cell, this->lower_) >= 0; |
| }); |
| } |
| return; |
| }; |
| case PredicateType::Equality: { |
| ApplyPredicate(block, sel, [this] (const void* cell) { |
| return DataTypeTraits<PhysicalType>::Compare(cell, this->lower_) == 0; |
| }); |
| return; |
| }; |
| case PredicateType::IsNotNull: { |
| if (!block.is_nullable()) return; |
| // TODO: make this more efficient by using bitwise operations on the |
| // null and selection vectors. |
| for (size_t i = 0; i < block.nrows(); i++) { |
| if (sel->IsRowSelected(i) && block.is_null(i)) { |
| BitmapClear(sel->mutable_bitmap(), i); |
| } |
| } |
| return; |
| }; |
| case PredicateType::InList: { |
| ApplyPredicate(block, sel, [this] (const void* cell) { |
| return std::binary_search(values_.begin(), values_.end(), cell, |
| [] (const void* lhs, const void* rhs) { |
| return DataTypeTraits<PhysicalType>::Compare(lhs, rhs) < 0; |
| }); |
| }); |
| return; |
| }; |
| case PredicateType::None: LOG(FATAL) << "NONE predicate evaluation"; |
| } |
| LOG(FATAL) << "unknown predicate type"; |
| } |
| |
| void ColumnPredicate::Evaluate(const ColumnBlock& block, SelectionVector* sel) const { |
| DCHECK(sel); |
| switch (block.type_info()->physical_type()) { |
| case BOOL: return EvaluateForPhysicalType<BOOL>(block, sel); |
| case INT8: return EvaluateForPhysicalType<INT8>(block, sel); |
| case INT16: return EvaluateForPhysicalType<INT16>(block, sel); |
| case INT32: return EvaluateForPhysicalType<INT32>(block, sel); |
| case INT64: return EvaluateForPhysicalType<INT64>(block, sel); |
| case UINT8: return EvaluateForPhysicalType<UINT8>(block, sel); |
| case UINT16: return EvaluateForPhysicalType<UINT16>(block, sel); |
| case UINT32: return EvaluateForPhysicalType<UINT32>(block, sel); |
| case UINT64: return EvaluateForPhysicalType<UINT64>(block, sel); |
| case FLOAT: return EvaluateForPhysicalType<FLOAT>(block, sel); |
| case DOUBLE: return EvaluateForPhysicalType<DOUBLE>(block, sel); |
| case BINARY: return EvaluateForPhysicalType<BINARY>(block, sel); |
| default: LOG(FATAL) << "unknown physical type: " << block.type_info()->physical_type(); |
| } |
| } |
| |
| string ColumnPredicate::ToString() const { |
| switch (predicate_type()) { |
| case PredicateType::None: return strings::Substitute("`$0` NONE", column_.name()); |
| case PredicateType::Range: { |
| if (lower_ == nullptr) { |
| return strings::Substitute("`$0` < $1", column_.name(), column_.Stringify(upper_)); |
| } else if (upper_ == nullptr) { |
| return strings::Substitute("`$0` >= $1", column_.name(), column_.Stringify(lower_)); |
| } else { |
| return strings::Substitute("`$0` >= $1 AND `$0` < $2", |
| column_.name(), |
| column_.Stringify(lower_), |
| column_.Stringify(upper_)); |
| } |
| }; |
| case PredicateType::Equality: { |
| return strings::Substitute("`$0` = $1", column_.name(), column_.Stringify(lower_)); |
| }; |
| case PredicateType::IsNotNull: { |
| return strings::Substitute("`$0` IS NOT NULL", column_.name()); |
| }; |
| case PredicateType::InList: { |
| string ss = "`"; |
| ss.append(column_.name()); |
| ss.append("` IN ("); |
| bool is_first = true; |
| for (auto* value : values_) { |
| if (is_first) { |
| is_first = false; |
| } else { |
| ss.append(", "); |
| } |
| ss.append(column_.Stringify(value)); |
| } |
| ss.append(")"); |
| return ss; |
| }; |
| } |
| LOG(FATAL) << "unknown predicate type"; |
| } |
| |
| bool ColumnPredicate::operator==(const ColumnPredicate& other) const { |
| if (!column_.Equals(other.column_, false)) { return false; } |
| if (predicate_type_ != other.predicate_type_) { |
| return false; |
| } |
| switch (predicate_type_) { |
| case PredicateType::Equality: return column_.type_info()->Compare(lower_, other.lower_) == 0; |
| case PredicateType::Range: { |
| return (lower_ == other.lower_ || |
| (lower_ != nullptr && other.lower_ != nullptr && |
| column_.type_info()->Compare(lower_, other.lower_) == 0)) && |
| (upper_ == other.upper_ || |
| (upper_ != nullptr && other.upper_ != nullptr && |
| column_.type_info()->Compare(upper_, other.upper_) == 0)); |
| }; |
| case PredicateType::InList: { |
| if (values_.size() != other.values_.size()) return false; |
| for (int i = 0; i < values_.size(); i++) { |
| if (column_.type_info()->Compare(values_[i], other.values_[i]) != 0) return false; |
| } |
| return true; |
| }; |
| case PredicateType::None: |
| case PredicateType::IsNotNull: return true; |
| } |
| LOG(FATAL) << "unknown predicate type"; |
| } |
| |
| bool ColumnPredicate::CheckValueInRange(const void* value) const { |
| CHECK(predicate_type_ == PredicateType::Range); |
| return ((lower_ == nullptr || column_.type_info()->Compare(lower_, value) <= 0) && |
| (upper_ == nullptr || column_.type_info()->Compare(upper_, value) > 0)); |
| } |
| |
| bool ColumnPredicate::CheckValueInList(const void* value) const { |
| return std::binary_search(values_.begin(), values_.end(), value, |
| [this](const void* lhs, const void* rhs) { |
| return this->column_.type_info()->Compare(lhs, rhs) < 0; |
| }); |
| } |
| |
| namespace { |
| int SelectivityRank(const ColumnPredicate& predicate) { |
| int rank; |
| switch (predicate.predicate_type()) { |
| case PredicateType::None: rank = 0; break; |
| case PredicateType::Equality: rank = 1; break; |
| case PredicateType::InList: rank = 2; break; |
| case PredicateType::Range: rank = 3; break; |
| case PredicateType::IsNotNull: rank = 4; break; |
| default: LOG(FATAL) << "unknown predicate type"; |
| } |
| return rank * (kLargestTypeSize + 1) + predicate.column().type_info()->size(); |
| } |
| } // anonymous namespace |
| |
| int SelectivityComparator(const ColumnPredicate& left, const ColumnPredicate& right) { |
| return SelectivityRank(left) - SelectivityRank(right); |
| } |
| |
| } // namespace kudu |