blob: 79fea4ed5f018a4b1e2f24caf626594b24b69435 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
#ifndef THETA_INTERSECTION_IMPL_HPP_
#define THETA_INTERSECTION_IMPL_HPP_
#include <algorithm>
namespace datasketches {
/*
* author Alexander Saydakov
* author Lee Rhodes
* author Kevin Lang
*/
template<typename A>
theta_intersection_alloc<A>::theta_intersection_alloc(uint64_t seed):
is_valid_(false),
is_empty_(false),
theta_(theta_sketch_alloc<A>::MAX_THETA),
lg_size_(0),
keys_(),
num_keys_(0),
seed_hash_(theta_sketch_alloc<A>::get_seed_hash(seed))
{}
template<typename A>
void theta_intersection_alloc<A>::update(const theta_sketch_alloc<A>& sketch) {
if (is_empty_) return;
if (sketch.get_seed_hash() != seed_hash_) throw std::invalid_argument("seed hash mismatch");
is_empty_ |= sketch.is_empty();
theta_ = std::min(theta_, sketch.get_theta64());
if (is_valid_ && num_keys_ == 0) return;
if (sketch.get_num_retained() == 0) {
is_valid_ = true;
if (keys_.size() > 0) {
keys_.resize(0);
lg_size_ = 0;
num_keys_ = 0;
}
return;
}
if (!is_valid_) { // first update, clone incoming sketch
is_valid_ = true;
lg_size_ = lg_size_from_count(sketch.get_num_retained(), update_theta_sketch_alloc<A>::REBUILD_THRESHOLD);
keys_.resize(1 << lg_size_, 0);
for (auto key: sketch) {
if (!update_theta_sketch_alloc<A>::hash_search_or_insert(key, keys_.data(), lg_size_)) {
throw std::invalid_argument("duplicate key, possibly corrupted input sketch");
}
++num_keys_;
}
if (num_keys_ != sketch.get_num_retained()) throw std::invalid_argument("num keys mismatch, possibly corrupted input sketch");
} else { // intersection
const uint32_t max_matches = std::min(num_keys_, sketch.get_num_retained());
vector_u64<A> matched_keys(max_matches);
uint32_t match_count = 0;
uint32_t count = 0;
for (auto key: sketch) {
if (key < theta_) {
if (update_theta_sketch_alloc<A>::hash_search(key, keys_.data(), lg_size_)) {
if (match_count == max_matches) throw std::invalid_argument("max matches exceeded, possibly corrupted input sketch");
matched_keys[match_count++] = key;
}
} else if (sketch.is_ordered()) {
break; // early stop
}
++count;
}
if (count > sketch.get_num_retained()) {
throw std::invalid_argument(" more keys then expected, possibly corrupted input sketch");
} else if (!sketch.is_ordered() && count < sketch.get_num_retained()) {
throw std::invalid_argument(" fewer keys then expected, possibly corrupted input sketch");
}
if (match_count == 0) {
keys_.resize(0);
lg_size_ = 0;
num_keys_ = 0;
if (theta_ == theta_sketch_alloc<A>::MAX_THETA) is_empty_ = true;
} else {
const uint8_t lg_size = lg_size_from_count(match_count, update_theta_sketch_alloc<A>::REBUILD_THRESHOLD);
if (lg_size != lg_size_) {
lg_size_ = lg_size;
keys_.resize(1 << lg_size_);
}
std::fill(&keys_[0], &keys_[1 << lg_size_], 0);
for (uint32_t i = 0; i < match_count; i++) {
update_theta_sketch_alloc<A>::hash_search_or_insert(matched_keys[i], keys_.data(), lg_size_);
}
num_keys_ = match_count;
}
}
}
template<typename A>
compact_theta_sketch_alloc<A> theta_intersection_alloc<A>::get_result(bool ordered) const {
if (!is_valid_) throw std::invalid_argument("calling get_result() before calling update() is undefined");
vector_u64<A> keys(num_keys_);
if (num_keys_ > 0) {
std::copy_if(&keys_[0], &keys_[1 << lg_size_], &keys[0], [](uint64_t key) { return key != 0; });
if (ordered) std::sort(keys.begin(), keys.end());
}
return compact_theta_sketch_alloc<A>(is_empty_, theta_, std::move(keys), seed_hash_, ordered);
}
template<typename A>
bool theta_intersection_alloc<A>::has_result() const {
return is_valid_;
}
} /* namespace datasketches */
# endif