| /** |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| #include "PredicateLeaf.hh" |
| #include "orc/BloomFilter.hh" |
| #include "orc/Common.hh" |
| #include "orc/Type.hh" |
| |
| #include <algorithm> |
| #include <functional> |
| #include <sstream> |
| #include <type_traits> |
| |
| namespace orc { |
| |
| PredicateLeaf::PredicateLeaf(Operator op, PredicateDataType type, const std::string& colName, |
| Literal literal) |
| : mOperator(op), mType(type), mColumnName(colName), mHasColumnName(true), mColumnId(0) { |
| mLiterals.emplace_back(literal); |
| mHashCode = hashCode(); |
| validate(); |
| } |
| |
| PredicateLeaf::PredicateLeaf(Operator op, PredicateDataType type, uint64_t columnId, |
| Literal literal) |
| : mOperator(op), mType(type), mHasColumnName(false), mColumnId(columnId) { |
| mLiterals.emplace_back(literal); |
| mHashCode = hashCode(); |
| validate(); |
| } |
| |
| PredicateLeaf::PredicateLeaf(Operator op, PredicateDataType type, const std::string& colName, |
| const std::initializer_list<Literal>& literals) |
| : mOperator(op), |
| mType(type), |
| mColumnName(colName), |
| mHasColumnName(true), |
| mLiterals(literals.begin(), literals.end()) { |
| mHashCode = hashCode(); |
| validate(); |
| } |
| |
| PredicateLeaf::PredicateLeaf(Operator op, PredicateDataType type, uint64_t columnId, |
| const std::initializer_list<Literal>& literals) |
| : mOperator(op), |
| mType(type), |
| mHasColumnName(false), |
| mColumnId(columnId), |
| mLiterals(literals.begin(), literals.end()) { |
| mHashCode = hashCode(); |
| validate(); |
| } |
| |
| PredicateLeaf::PredicateLeaf(Operator op, PredicateDataType type, const std::string& colName, |
| const std::vector<Literal>& literals) |
| : mOperator(op), |
| mType(type), |
| mColumnName(colName), |
| mHasColumnName(true), |
| mLiterals(literals.begin(), literals.end()) { |
| mHashCode = hashCode(); |
| validate(); |
| } |
| |
| PredicateLeaf::PredicateLeaf(Operator op, PredicateDataType type, uint64_t columnId, |
| const std::vector<Literal>& literals) |
| : mOperator(op), |
| mType(type), |
| mHasColumnName(false), |
| mColumnId(columnId), |
| mLiterals(literals.begin(), literals.end()) { |
| mHashCode = hashCode(); |
| validate(); |
| } |
| |
| void PredicateLeaf::validateColumn() const { |
| if (mHasColumnName && mColumnName.empty()) { |
| throw std::invalid_argument("column name should not be empty"); |
| } else if (!mHasColumnName && mColumnId == INVALID_COLUMN_ID) { |
| throw std::invalid_argument("invalid column id"); |
| } |
| } |
| |
| void PredicateLeaf::validate() const { |
| switch (mOperator) { |
| case Operator::IS_NULL: |
| validateColumn(); |
| if (!mLiterals.empty()) { |
| throw std::invalid_argument("No literal is required!"); |
| } |
| break; |
| case Operator::EQUALS: |
| case Operator::NULL_SAFE_EQUALS: |
| case Operator::LESS_THAN: |
| case Operator::LESS_THAN_EQUALS: |
| validateColumn(); |
| if (mLiterals.size() != 1) { |
| throw std::invalid_argument("One literal is required!"); |
| } |
| if (static_cast<int>(mLiterals.at(0).getType()) != static_cast<int>(mType)) { |
| throw std::invalid_argument("leaf and literal types do not match!"); |
| } |
| break; |
| case Operator::IN: |
| validateColumn(); |
| if (mLiterals.size() < 2) { |
| throw std::invalid_argument("At least two literals are required!"); |
| } |
| for (auto literal : mLiterals) { |
| if (static_cast<int>(literal.getType()) != static_cast<int>(mType)) { |
| throw std::invalid_argument("leaf and literal types do not match!"); |
| } |
| } |
| break; |
| case Operator::BETWEEN: |
| validateColumn(); |
| for (auto literal : mLiterals) { |
| if (static_cast<int>(literal.getType()) != static_cast<int>(mType)) { |
| throw std::invalid_argument("leaf and literal types do not match!"); |
| } |
| } |
| break; |
| default: |
| break; |
| } |
| } |
| |
| PredicateLeaf::Operator PredicateLeaf::getOperator() const { |
| return mOperator; |
| } |
| |
| PredicateDataType PredicateLeaf::getType() const { |
| return mType; |
| } |
| |
| bool PredicateLeaf::hasColumnName() const { |
| return mHasColumnName; |
| } |
| |
| /** |
| * Get the simple column name. |
| */ |
| const std::string& PredicateLeaf::getColumnName() const { |
| return mColumnName; |
| } |
| |
| uint64_t PredicateLeaf::getColumnId() const { |
| return mColumnId; |
| } |
| |
| /** |
| * Get the literal half of the predicate leaf. |
| */ |
| Literal PredicateLeaf::getLiteral() const { |
| return mLiterals.at(0); |
| } |
| |
| /** |
| * For operators with multiple literals (IN and BETWEEN), get the literals. |
| */ |
| const std::vector<Literal>& PredicateLeaf::getLiteralList() const { |
| return mLiterals; |
| } |
| |
| static std::string getLiteralString(const std::vector<Literal>& literals) { |
| return literals.at(0).toString(); |
| } |
| |
| static std::string getLiteralsString(const std::vector<Literal>& literals) { |
| std::ostringstream sstream; |
| sstream << "["; |
| for (size_t i = 0; i != literals.size(); ++i) { |
| sstream << literals[i].toString(); |
| if (i + 1 != literals.size()) { |
| sstream << ", "; |
| } |
| } |
| sstream << "]"; |
| return sstream.str(); |
| } |
| |
| std::string PredicateLeaf::columnDebugString() const { |
| if (mHasColumnName) return mColumnName; |
| std::ostringstream sstream; |
| sstream << "column(id=" << mColumnId << ')'; |
| return sstream.str(); |
| } |
| |
| std::string PredicateLeaf::toString() const { |
| std::ostringstream sstream; |
| sstream << '('; |
| switch (mOperator) { |
| case Operator::IS_NULL: |
| sstream << columnDebugString() << " is null"; |
| break; |
| case Operator::EQUALS: |
| sstream << columnDebugString() << " = " << getLiteralString(mLiterals); |
| break; |
| case Operator::NULL_SAFE_EQUALS: |
| sstream << columnDebugString() << " null_safe_= " << getLiteralString(mLiterals); |
| break; |
| case Operator::LESS_THAN: |
| sstream << columnDebugString() << " < " << getLiteralString(mLiterals); |
| break; |
| case Operator::LESS_THAN_EQUALS: |
| sstream << columnDebugString() << " <= " << getLiteralString(mLiterals); |
| break; |
| case Operator::IN: |
| sstream << columnDebugString() << " in " << getLiteralsString(mLiterals); |
| break; |
| case Operator::BETWEEN: |
| sstream << columnDebugString() << " between " << getLiteralsString(mLiterals); |
| break; |
| default: |
| sstream << "unknown operator, column: " << columnDebugString() |
| << ", literals: " << getLiteralsString(mLiterals); |
| } |
| sstream << ')'; |
| return sstream.str(); |
| } |
| |
| size_t PredicateLeaf::hashCode() const { |
| size_t value = 0; |
| std::for_each(mLiterals.cbegin(), mLiterals.cend(), |
| [&](const Literal& literal) { value = value * 17 + literal.getHashCode(); }); |
| auto colHash = |
| mHasColumnName ? std::hash<std::string>{}(mColumnName) : std::hash<uint64_t>{}(mColumnId); |
| return value * 103 * 101 * 3 * 17 + std::hash<int>{}(static_cast<int>(mOperator)) + |
| std::hash<int>{}(static_cast<int>(mType)) * 17 + colHash * 3 * 17; |
| } |
| |
| bool PredicateLeaf::operator==(const PredicateLeaf& r) const { |
| if (this == &r) { |
| return true; |
| } |
| if (mHashCode != r.mHashCode || mType != r.mType || mOperator != r.mOperator || |
| mHasColumnName != r.mHasColumnName || mColumnName != r.mColumnName || |
| mColumnId != r.mColumnId || mLiterals.size() != r.mLiterals.size()) { |
| return false; |
| } |
| for (size_t i = 0; i != mLiterals.size(); ++i) { |
| if (mLiterals[i] != r.mLiterals[i]) { |
| return false; |
| } |
| } |
| return true; |
| } |
| |
// Position of a predicate value relative to a [min, max] range.
enum class Location {
  BEFORE,  // below the minimum
  MIN,     // equal to the minimum
  MIDDLE,  // none of the other positions
  MAX,     // equal to the maximum
  AFTER    // above the maximum
};
| |
| DIAGNOSTIC_PUSH |
| DIAGNOSTIC_IGNORE("-Wfloat-equal") |
| |
| /** |
| * Given a point and min and max, determine if the point is before, at the |
| * min, in the middle, at the max, or after the range. |
| * @param point the point to test |
| * @param min the minimum point |
| * @param max the maximum point |
| * @return the location of the point |
| */ |
| template <typename T> |
| Location compareToRange(const T& point, const T& min, const T& max) { |
| if (point < min) { |
| return Location::BEFORE; |
| } else if (point == min) { |
| return Location::MIN; |
| } |
| |
| if (point > max) { |
| return Location::AFTER; |
| } else if (point == max) { |
| return Location::MAX; |
| } |
| |
| return Location::MIDDLE; |
| } |
| |
| /** |
| * Evaluate a predicate leaf according to min/max values |
| * @param op operator of the predicate |
| * @param values the value to test |
| * @param minValue the minimum value |
| * @param maxValue the maximum value |
| * @param hasNull whether the statistics contain null |
| * @return the TruthValue result of the test |
| */ |
| template <typename T> |
| TruthValue evaluatePredicateRange(const PredicateLeaf::Operator op, const std::vector<T>& values, |
| const T& minValue, const T& maxValue, bool hasNull) { |
| Location loc; |
| switch (op) { |
| case PredicateLeaf::Operator::NULL_SAFE_EQUALS: |
| loc = compareToRange(values.at(0), minValue, maxValue); |
| if (loc == Location::BEFORE || loc == Location::AFTER) { |
| return TruthValue::NO; |
| } else { |
| return TruthValue::YES_NO; |
| } |
| case PredicateLeaf::Operator::EQUALS: |
| loc = compareToRange(values.at(0), minValue, maxValue); |
| if (minValue == maxValue && loc == Location::MIN) { |
| return hasNull ? TruthValue::YES_NULL : TruthValue::YES; |
| } else if (loc == Location::BEFORE || loc == Location::AFTER) { |
| return hasNull ? TruthValue::NO_NULL : TruthValue::NO; |
| } else { |
| return hasNull ? TruthValue::YES_NO_NULL : TruthValue::YES_NO; |
| } |
| case PredicateLeaf::Operator::LESS_THAN: |
| loc = compareToRange(values.at(0), minValue, maxValue); |
| if (loc == Location::AFTER) { |
| return hasNull ? TruthValue::YES_NULL : TruthValue::YES; |
| } else if (loc == Location::BEFORE || loc == Location::MIN) { |
| return hasNull ? TruthValue::NO_NULL : TruthValue::NO; |
| } else { |
| return hasNull ? TruthValue::YES_NO_NULL : TruthValue::YES_NO; |
| } |
| case PredicateLeaf::Operator::LESS_THAN_EQUALS: |
| loc = compareToRange(values.at(0), minValue, maxValue); |
| if (loc == Location::AFTER || loc == Location::MAX || |
| (loc == Location::MIN && minValue == maxValue)) { |
| return hasNull ? TruthValue::YES_NULL : TruthValue::YES; |
| } else if (loc == Location::BEFORE) { |
| return hasNull ? TruthValue::NO_NULL : TruthValue::NO; |
| } else { |
| return hasNull ? TruthValue::YES_NO_NULL : TruthValue::YES_NO; |
| } |
| case PredicateLeaf::Operator::IN: |
| if (minValue == maxValue) { |
| // for a single value, look through to see if that value is in the set |
| for (auto& value : values) { |
| loc = compareToRange(value, minValue, maxValue); |
| if (loc == Location::MIN) { |
| return hasNull ? TruthValue::YES_NULL : TruthValue::YES; |
| } |
| } |
| return hasNull ? TruthValue::NO_NULL : TruthValue::NO; |
| } else { |
| // are all of the values outside of the range? |
| for (auto& value : values) { |
| loc = compareToRange(value, minValue, maxValue); |
| if (loc == Location::MIN || loc == Location::MIDDLE || loc == Location::MAX) { |
| return hasNull ? TruthValue::YES_NO_NULL : TruthValue::YES_NO; |
| } |
| } |
| return hasNull ? TruthValue::NO_NULL : TruthValue::NO; |
| } |
| case PredicateLeaf::Operator::BETWEEN: |
| if (values.empty()) { |
| return TruthValue::YES_NO; |
| } |
| loc = compareToRange(values.at(0), minValue, maxValue); |
| if (loc == Location::BEFORE || loc == Location::MIN) { |
| Location loc2 = compareToRange(values.at(1), minValue, maxValue); |
| if (loc2 == Location::AFTER || loc2 == Location::MAX) { |
| return hasNull ? TruthValue::YES_NULL : TruthValue::YES; |
| } else if (loc2 == Location::BEFORE) { |
| return hasNull ? TruthValue::NO_NULL : TruthValue::NO; |
| } else { |
| return hasNull ? TruthValue::YES_NO_NULL : TruthValue::YES_NO; |
| } |
| } else if (loc == Location::AFTER) { |
| return hasNull ? TruthValue::NO_NULL : TruthValue::NO; |
| } else { |
| return hasNull ? TruthValue::YES_NO_NULL : TruthValue::YES_NO; |
| } |
| case PredicateLeaf::Operator::IS_NULL: |
| // min = null condition above handles the all-nulls YES case |
| return hasNull ? TruthValue::YES_NO : TruthValue::NO; |
| default: |
| return hasNull ? TruthValue::YES_NO_NULL : TruthValue::YES_NO; |
| } |
| } |
| |
| DIAGNOSTIC_POP |
| |
| static TruthValue evaluateBoolPredicate(const PredicateLeaf::Operator op, |
| const std::vector<Literal>& literals, |
| const proto::ColumnStatistics& stats) { |
| bool hasNull = stats.hasnull(); |
| if (!stats.has_bucketstatistics() || stats.bucketstatistics().count_size() == 0) { |
| // does not have bool stats |
| return hasNull ? TruthValue::YES_NO_NULL : TruthValue::YES_NO; |
| } |
| |
| auto trueCount = stats.bucketstatistics().count(0); |
| auto falseCount = stats.numberofvalues() - trueCount; |
| switch (op) { |
| case PredicateLeaf::Operator::IS_NULL: |
| return hasNull ? TruthValue::YES_NO : TruthValue::NO; |
| case PredicateLeaf::Operator::NULL_SAFE_EQUALS: { |
| if (literals.at(0).getBool()) { |
| if (trueCount == 0) { |
| return TruthValue::NO; |
| } else if (falseCount == 0) { |
| return TruthValue::YES; |
| } |
| } else { |
| if (falseCount == 0) { |
| return TruthValue::NO; |
| } else if (trueCount == 0) { |
| return TruthValue::YES; |
| } |
| } |
| return TruthValue::YES_NO; |
| } |
| case PredicateLeaf::Operator::EQUALS: { |
| if (literals.at(0).getBool()) { |
| if (trueCount == 0) { |
| return hasNull ? TruthValue::NO_NULL : TruthValue::NO; |
| } else if (falseCount == 0) { |
| return hasNull ? TruthValue::YES_NULL : TruthValue::YES; |
| } |
| } else { |
| if (falseCount == 0) { |
| return hasNull ? TruthValue::NO_NULL : TruthValue::NO; |
| } else if (trueCount == 0) { |
| return hasNull ? TruthValue::YES_NULL : TruthValue::YES; |
| } |
| } |
| return hasNull ? TruthValue::YES_NO_NULL : TruthValue::YES_NO; |
| } |
| case PredicateLeaf::Operator::LESS_THAN: |
| case PredicateLeaf::Operator::LESS_THAN_EQUALS: |
| case PredicateLeaf::Operator::IN: |
| case PredicateLeaf::Operator::BETWEEN: |
| default: |
| return hasNull ? TruthValue::YES_NO_NULL : TruthValue::YES_NO; |
| } |
| } |
| |
| static std::vector<int64_t> literal2Long(const std::vector<Literal>& values) { |
| std::vector<int64_t> result; |
| std::for_each(values.cbegin(), values.cend(), [&](const Literal& val) { |
| if (!val.isNull()) { |
| result.emplace_back(val.getLong()); |
| } |
| }); |
| return result; |
| } |
| |
| static std::vector<int32_t> literal2Date(const std::vector<Literal>& values) { |
| std::vector<int32_t> result; |
| std::for_each(values.cbegin(), values.cend(), [&](const Literal& val) { |
| if (!val.isNull()) { |
| result.emplace_back(val.getDate()); |
| } |
| }); |
| return result; |
| } |
| |
| static std::vector<Literal::Timestamp> literal2Timestamp(const std::vector<Literal>& values) { |
| std::vector<Literal::Timestamp> result; |
| std::for_each(values.cbegin(), values.cend(), [&](const Literal& val) { |
| if (!val.isNull()) { |
| result.emplace_back(val.getTimestamp()); |
| } |
| }); |
| return result; |
| } |
| |
| static std::vector<Decimal> literal2Decimal(const std::vector<Literal>& values) { |
| std::vector<Decimal> result; |
| std::for_each(values.cbegin(), values.cend(), [&](const Literal& val) { |
| if (!val.isNull()) { |
| result.emplace_back(val.getDecimal()); |
| } |
| }); |
| return result; |
| } |
| |
| static std::vector<double> literal2Double(const std::vector<Literal>& values) { |
| std::vector<double> result; |
| std::for_each(values.cbegin(), values.cend(), [&](const Literal& val) { |
| if (!val.isNull()) { |
| result.emplace_back(val.getFloat()); |
| } |
| }); |
| return result; |
| } |
| |
| static std::vector<std::string> literal2String(const std::vector<Literal>& values) { |
| std::vector<std::string> result; |
| std::for_each(values.cbegin(), values.cend(), [&](const Literal& val) { |
| if (!val.isNull()) { |
| result.emplace_back(val.getString()); |
| } |
| }); |
| return result; |
| } |
| |
| TruthValue PredicateLeaf::evaluatePredicateMinMax(const proto::ColumnStatistics& colStats) const { |
| TruthValue result = TruthValue::YES_NO_NULL; |
| switch (mType) { |
| case PredicateDataType::LONG: { |
| if (colStats.has_intstatistics() && colStats.intstatistics().has_minimum() && |
| colStats.intstatistics().has_maximum()) { |
| const auto& stats = colStats.intstatistics(); |
| result = evaluatePredicateRange(mOperator, literal2Long(mLiterals), stats.minimum(), |
| stats.maximum(), colStats.hasnull()); |
| } |
| break; |
| } |
| case PredicateDataType::FLOAT: { |
| if (colStats.has_doublestatistics() && colStats.doublestatistics().has_minimum() && |
| colStats.doublestatistics().has_maximum()) { |
| const auto& stats = colStats.doublestatistics(); |
| if (!std::isfinite(stats.sum())) { |
| result = colStats.hasnull() ? TruthValue::YES_NO_NULL : TruthValue::YES_NO; |
| } else { |
| result = evaluatePredicateRange(mOperator, literal2Double(mLiterals), stats.minimum(), |
| stats.maximum(), colStats.hasnull()); |
| } |
| } |
| break; |
| } |
| case PredicateDataType::STRING: { |
| /// TODO: check lowerBound and upperBound as well |
| if (colStats.has_stringstatistics() && colStats.stringstatistics().has_minimum() && |
| colStats.stringstatistics().has_maximum()) { |
| const auto& stats = colStats.stringstatistics(); |
| result = evaluatePredicateRange(mOperator, literal2String(mLiterals), stats.minimum(), |
| stats.maximum(), colStats.hasnull()); |
| } |
| break; |
| } |
| case PredicateDataType::DATE: { |
| if (colStats.has_datestatistics() && colStats.datestatistics().has_minimum() && |
| colStats.datestatistics().has_maximum()) { |
| const auto& stats = colStats.datestatistics(); |
| result = evaluatePredicateRange(mOperator, literal2Date(mLiterals), stats.minimum(), |
| stats.maximum(), colStats.hasnull()); |
| } |
| break; |
| } |
| case PredicateDataType::TIMESTAMP: { |
| if (colStats.has_timestampstatistics() && colStats.timestampstatistics().has_minimumutc() && |
| colStats.timestampstatistics().has_maximumutc()) { |
| const auto& stats = colStats.timestampstatistics(); |
| constexpr int32_t DEFAULT_MIN_NANOS = 0; |
| constexpr int32_t DEFAULT_MAX_NANOS = 999999; |
| int32_t minNano = stats.has_minimumnanos() ? stats.minimumnanos() - 1 : DEFAULT_MIN_NANOS; |
| int32_t maxNano = stats.has_maximumnanos() ? stats.maximumnanos() - 1 : DEFAULT_MAX_NANOS; |
| Literal::Timestamp minTimestamp( |
| stats.minimumutc() / 1000, |
| static_cast<int32_t>((stats.minimumutc() % 1000) * 1000000) + minNano); |
| Literal::Timestamp maxTimestamp( |
| stats.maximumutc() / 1000, |
| static_cast<int32_t>((stats.maximumutc() % 1000) * 1000000) + maxNano); |
| result = evaluatePredicateRange(mOperator, literal2Timestamp(mLiterals), minTimestamp, |
| maxTimestamp, colStats.hasnull()); |
| } |
| break; |
| } |
| case PredicateDataType::DECIMAL: { |
| if (colStats.has_decimalstatistics() && colStats.decimalstatistics().has_minimum() && |
| colStats.decimalstatistics().has_maximum()) { |
| const auto& stats = colStats.decimalstatistics(); |
| result = evaluatePredicateRange(mOperator, literal2Decimal(mLiterals), |
| Decimal(stats.minimum()), Decimal(stats.maximum()), |
| colStats.hasnull()); |
| } |
| break; |
| } |
| case PredicateDataType::BOOLEAN: { |
| if (colStats.has_bucketstatistics()) { |
| result = evaluateBoolPredicate(mOperator, mLiterals, colStats); |
| } |
| break; |
| } |
| default: |
| break; |
| } |
| |
| // make sure null literal is respected for IN operator |
| if (mOperator == Operator::IN && colStats.hasnull()) { |
| for (const auto& literal : mLiterals) { |
| if (literal.isNull()) { |
| result = TruthValue::YES_NO_NULL; |
| break; |
| } |
| } |
| } |
| |
| return result; |
| } |
| |
| static bool shouldEvaluateBloomFilter(PredicateLeaf::Operator op, TruthValue result, |
| const BloomFilter* bloomFilter) { |
| // evaluate bloom filter only when |
| // 1) Bloom filter is available |
| // 2) Min/Max evaluation yield YES or MAYBE |
| // 3) Predicate is EQUALS or IN list |
| // 4) Decimal type stores its string representation |
| // but has inconsistency in trailing zeros |
| if (bloomFilter != nullptr && result != TruthValue::NO_NULL && result != TruthValue::NO && |
| (op == PredicateLeaf::Operator::EQUALS || op == PredicateLeaf::Operator::NULL_SAFE_EQUALS || |
| op == PredicateLeaf::Operator::IN)) { |
| return true; |
| } |
| return false; |
| } |
| |
| static TruthValue checkInBloomFilter(PredicateLeaf::Operator, PredicateDataType type, |
| const Literal& literal, const BloomFilter* bf, |
| bool hasNull) { |
| TruthValue result = hasNull ? TruthValue::NO_NULL : TruthValue::NO; |
| if (literal.isNull()) { |
| result = hasNull ? TruthValue::YES_NO_NULL : TruthValue::NO; |
| } else if (type == PredicateDataType::LONG) { |
| if (bf->testLong(literal.getLong())) { |
| result = TruthValue::YES_NO_NULL; |
| } |
| } else if (type == PredicateDataType::FLOAT) { |
| if (bf->testDouble(literal.getFloat())) { |
| result = TruthValue::YES_NO_NULL; |
| } |
| } else if (type == PredicateDataType::STRING) { |
| std::string str = literal.getString(); |
| if (bf->testBytes(str.c_str(), static_cast<int64_t>(str.size()))) { |
| result = TruthValue::YES_NO_NULL; |
| } |
| } else if (type == PredicateDataType::DECIMAL) { |
| std::string decimal = literal.getDecimal().toString(true); |
| if (bf->testBytes(decimal.c_str(), static_cast<int64_t>(decimal.size()))) { |
| result = TruthValue::YES_NO_NULL; |
| } |
| } else if (type == PredicateDataType::TIMESTAMP) { |
| if (bf->testLong(literal.getTimestamp().getMillis())) { |
| result = TruthValue::YES_NO_NULL; |
| } |
| } else if (type == PredicateDataType::DATE) { |
| if (bf->testLong(literal.getDate())) { |
| result = TruthValue::YES_NO_NULL; |
| } |
| } else { |
| result = TruthValue::YES_NO_NULL; |
| } |
| |
| if (result == TruthValue::YES_NO_NULL && !hasNull) { |
| result = TruthValue::YES_NO; |
| } |
| |
| return result; |
| } |
| |
| TruthValue PredicateLeaf::evaluatePredicateBloomFiter(const BloomFilter* bf, bool hasNull) const { |
| switch (mOperator) { |
| case Operator::NULL_SAFE_EQUALS: |
| // null safe equals does not return *_NULL variant. |
| // So set hasNull to false |
| return checkInBloomFilter(mOperator, mType, mLiterals.front(), bf, false); |
| case Operator::EQUALS: |
| return checkInBloomFilter(mOperator, mType, mLiterals.front(), bf, hasNull); |
| case Operator::IN: |
| for (const auto& literal : mLiterals) { |
| // if at least one value in IN list exist in bloom filter, |
| // qualify the row group/stripe |
| TruthValue result = checkInBloomFilter(mOperator, mType, literal, bf, hasNull); |
| if (result == TruthValue::YES_NO_NULL || result == TruthValue::YES_NO) { |
| return result; |
| } |
| } |
| return hasNull ? TruthValue::NO_NULL : TruthValue::NO; |
| case Operator::LESS_THAN: |
| case Operator::LESS_THAN_EQUALS: |
| case Operator::BETWEEN: |
| case Operator::IS_NULL: |
| default: |
| return hasNull ? TruthValue::YES_NO_NULL : TruthValue::YES_NO; |
| } |
| } |
| |
| TruthValue PredicateLeaf::evaluate(const WriterVersion writerVersion, |
| const proto::ColumnStatistics& colStats, |
| const BloomFilter* bloomFilter) const { |
| // files written before ORC-135 stores timestamp wrt to local timezone |
| // causing issues with PPD. disable PPD for timestamp for all old files |
| if (mType == PredicateDataType::TIMESTAMP) { |
| if (writerVersion < WriterVersion::WriterVersion_ORC_135) { |
| return TruthValue::YES_NO_NULL; |
| } |
| } |
| |
| bool allNull = colStats.hasnull() && colStats.numberofvalues() == 0; |
| if (mOperator == Operator::IS_NULL || |
| ((mOperator == Operator::EQUALS || mOperator == Operator::NULL_SAFE_EQUALS) && |
| mLiterals.at(0).isNull())) { |
| // IS_NULL operator does not need to check min/max stats and bloom filter |
| return allNull ? TruthValue::YES : (colStats.hasnull() ? TruthValue::YES_NO : TruthValue::NO); |
| } else if (allNull) { |
| // if we don't have any value, everything must have been null |
| return TruthValue::IS_NULL; |
| } |
| |
| TruthValue result = evaluatePredicateMinMax(colStats); |
| if (shouldEvaluateBloomFilter(mOperator, result, bloomFilter)) { |
| return evaluatePredicateBloomFiter(bloomFilter, colStats.hasnull()); |
| } else { |
| return result; |
| } |
| } |
| |
| } // namespace orc |