blob: 4fd7320abdb3cf5d2c6651e3546f1362b51ed9a8 [file] [log] [blame]
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
**/
#include "query_optimizer/ExecutionHeuristics.hpp"
#include <cstddef>
#include <utility>
#include <unordered_map>
#include <vector>
#include "catalog/CatalogTypedefs.hpp"
#include "query_execution/QueryContext.pb.h"
#include "query_optimizer/QueryPlan.hpp"
#include "utility/Macros.hpp"
#include "glog/logging.h"
namespace quickstep {
namespace optimizer {
void ExecutionHeuristics::optimizeExecutionPlan(QueryPlan *query_plan,
serialization::QueryContext *query_context_proto) {
// Currently this only optimizes left deep joins using bloom filters.
// It uses a simple algorithm to discover the left deep joins.
// It starts with the first hash join in the plan and keeps on iterating
// over the next hash joins, till a probe on a different relation id is found.
// The set of hash joins found in this way forms a chain and can be recognized
// as a left deep join. It becomes a candidate for optimization.
// The optimization is done by modifying each of the build operators in the chain
// to generate a bloom filter on the build key during their hash table creation.
// The leaf-level probe operator is then modified to query all the bloom
// filters generated from all the build operators in the chain. These
// bloom filters are queried to test the membership of the probe key
// just prior to probing the hash table.
QueryPlan::DAGNodeIndex origin_node = 0;
while (origin_node < hash_joins_.size() - 1) {
std::vector<std::size_t> chained_nodes;
chained_nodes.push_back(origin_node);
for (std::size_t i = origin_node + 1; i < hash_joins_.size(); ++i) {
const relation_id checked_relation_id = hash_joins_[origin_node].referenced_stored_probe_relation_->getID();
const relation_id expected_relation_id = hash_joins_[i].referenced_stored_probe_relation_->getID();
if (checked_relation_id == expected_relation_id) {
chained_nodes.push_back(i);
} else {
break;
}
}
// Only chains of length greater than one are suitable candidates for semi-join optimization.
if (chained_nodes.size() > 1) {
std::unordered_map<QueryContext::bloom_filter_id, std::vector<attribute_id>> probe_bloom_filter_info;
for (const std::size_t node : chained_nodes) {
// Provision for a new bloom filter to be used by the build operator.
const QueryContext::bloom_filter_id bloom_filter_id = query_context_proto->bloom_filters_size();
serialization::BloomFilter *bloom_filter_proto = query_context_proto->add_bloom_filters();
// Modify the bloom filter properties based on the statistics of the relation.
setBloomFilterProperties(bloom_filter_proto, hash_joins_[node].referenced_stored_build_relation_);
// Add build-side bloom filter information to the corresponding hash table proto.
query_context_proto->mutable_join_hash_tables(hash_joins_[node].join_hash_table_id_)
->add_build_side_bloom_filter_id(bloom_filter_id);
probe_bloom_filter_info.insert(std::make_pair(bloom_filter_id, hash_joins_[node].probe_attributes_));
}
// Add probe-side bloom filter information to the corresponding hash table proto for each build-side bloom filter.
for (const std::pair<QueryContext::bloom_filter_id, std::vector<attribute_id>>
&bloom_filter_info : probe_bloom_filter_info) {
auto *probe_side_bloom_filter =
query_context_proto->mutable_join_hash_tables(hash_joins_[origin_node].join_hash_table_id_)
->add_probe_side_bloom_filters();
probe_side_bloom_filter->set_probe_side_bloom_filter_id(bloom_filter_info.first);
for (const attribute_id &probe_attribute_id : bloom_filter_info.second) {
probe_side_bloom_filter->add_probe_side_attr_ids(probe_attribute_id);
}
}
// Add node dependencies from chained build nodes to origin node probe.
for (std::size_t i = 1; i < chained_nodes.size(); ++i) { // Note: It starts from index 1.
query_plan->addDirectDependency(hash_joins_[origin_node].join_operator_index_,
hash_joins_[origin_node + i].build_operator_index_,
true /* is_pipeline_breaker */);
}
}
// Update the origin node.
origin_node = chained_nodes.back() + 1;
}
}
void ExecutionHeuristics::setBloomFilterProperties(serialization::BloomFilter *bloom_filter_proto,
const CatalogRelation *relation) {
const std::size_t cardinality = relation->estimateTupleCardinality();
if (cardinality < kOneThousand) {
bloom_filter_proto->set_bloom_filter_size(kOneThousand / kCompressionFactor);
bloom_filter_proto->set_number_of_hashes(kVeryLowSparsityHash);
} else if (cardinality < kTenThousand) {
bloom_filter_proto->set_bloom_filter_size(kTenThousand / kCompressionFactor);
bloom_filter_proto->set_number_of_hashes(kLowSparsityHash);
} else if (cardinality < kHundredThousand) {
bloom_filter_proto->set_bloom_filter_size(kHundredThousand / kCompressionFactor);
bloom_filter_proto->set_number_of_hashes(kMediumSparsityHash);
} else {
bloom_filter_proto->set_bloom_filter_size(kMillion / kCompressionFactor);
bloom_filter_proto->set_number_of_hashes(kHighSparsityHash);
}
}
} // namespace optimizer
} // namespace quickstep