Merge pull request #155 from apache/patch_for_rc4
Apply changes to create rc4
diff --git a/theta/CMakeLists.txt b/theta/CMakeLists.txt
index ae02e79..69af9db 100644
--- a/theta/CMakeLists.txt
+++ b/theta/CMakeLists.txt
@@ -36,6 +36,7 @@
list(APPEND theta_HEADERS "include/theta_sketch.hpp;include/theta_union.hpp;include/theta_intersection.hpp")
list(APPEND theta_HEADERS "include/theta_a_not_b.hpp;include/binomial_bounds.hpp;include/theta_sketch_impl.hpp")
list(APPEND theta_HEADERS "include/theta_union_impl.hpp;include/theta_intersection_impl.hpp;include/theta_a_not_b_impl.hpp")
+list(APPEND theta_HEADERS "include/conditional_back_inserter.hpp")
install(TARGETS theta
EXPORT ${PROJECT_NAME}
@@ -55,4 +56,5 @@
${CMAKE_CURRENT_SOURCE_DIR}/include/theta_union_impl.hpp
${CMAKE_CURRENT_SOURCE_DIR}/include/theta_intersection_impl.hpp
${CMAKE_CURRENT_SOURCE_DIR}/include/theta_a_not_b_impl.hpp
+ ${CMAKE_CURRENT_SOURCE_DIR}/include/conditional_back_inserter.hpp
)
diff --git a/theta/include/conditional_back_inserter.hpp b/theta/include/conditional_back_inserter.hpp
new file mode 100644
index 0000000..9b33833
--- /dev/null
+++ b/theta/include/conditional_back_inserter.hpp
@@ -0,0 +1,63 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#ifndef CONDITIONAL_BACK_INSERTER_HPP_
+#define CONDITIONAL_BACK_INSERTER_HPP_
+
+#include <iterator>
+#include <functional>
+
+namespace datasketches {
+
+template <typename Container, typename Predicate>
+class conditional_back_insert_iterator: public std::back_insert_iterator<Container> {
+public:
+ template<typename P>
+ conditional_back_insert_iterator(Container& c, P&& p): std::back_insert_iterator<Container>(c), p(std::forward<P>(p)) {}
+
+ // MSVC seems to insist on having copy constructor and assignment
+ conditional_back_insert_iterator(const conditional_back_insert_iterator& other):
+ std::back_insert_iterator<Container>(other), p(other.p) {}
+ conditional_back_insert_iterator& operator=(const conditional_back_insert_iterator& other) {
+ std::back_insert_iterator<Container>::operator=(other);
+ p = other.p;
+ return *this;
+ }
+
+ conditional_back_insert_iterator& operator=(typename Container::const_reference value) {
+ if (p(value)) std::back_insert_iterator<Container>::operator=(value);
+ return *this;
+ }
+
+ conditional_back_insert_iterator& operator*() { return *this; }
+ conditional_back_insert_iterator& operator++() { return *this; }
+ conditional_back_insert_iterator& operator++(int) { return *this; }
+
+private:
+ Predicate p;
+};
+
+template<typename Container, typename Predicate>
+conditional_back_insert_iterator<Container, Predicate> conditional_back_inserter(Container& c, Predicate&& p) {
+ return conditional_back_insert_iterator<Container, Predicate>(c, std::forward<Predicate>(p));
+}
+
+} /* namespace datasketches */
+
+#endif
diff --git a/theta/include/theta_a_not_b.hpp b/theta/include/theta_a_not_b.hpp
index 5287887..db66ac7 100644
--- a/theta/include/theta_a_not_b.hpp
+++ b/theta/include/theta_a_not_b.hpp
@@ -54,6 +54,13 @@
typedef typename std::allocator_traits<A>::template rebind_alloc<uint64_t> AllocU64;
uint16_t seed_hash_;
+ class less_than {
+ public:
+ explicit less_than(uint64_t value): value(value) {}
+ bool operator()(uint64_t value) const { return value < this->value; }
+ private:
+ uint64_t value;
+ };
};
// alias with default allocator for convenience
diff --git a/theta/include/theta_a_not_b_impl.hpp b/theta/include/theta_a_not_b_impl.hpp
index cc171ce..4343ee3 100644
--- a/theta/include/theta_a_not_b_impl.hpp
+++ b/theta/include/theta_a_not_b_impl.hpp
@@ -22,6 +22,8 @@
#include <algorithm>
+#include "conditional_back_inserter.hpp"
+
namespace datasketches {
/*
@@ -37,61 +39,42 @@
template<typename A>
compact_theta_sketch_alloc<A> theta_a_not_b_alloc<A>::compute(const theta_sketch_alloc<A>& a, const theta_sketch_alloc<A>& b, bool ordered) const {
- if (a.is_empty()) return compact_theta_sketch_alloc<A>(a, ordered);
+ if (a.is_empty() || a.get_num_retained() == 0 || b.is_empty()) return compact_theta_sketch_alloc<A>(a, ordered);
if (a.get_seed_hash() != seed_hash_) throw std::invalid_argument("A seed hash mismatch");
if (b.get_seed_hash() != seed_hash_) throw std::invalid_argument("B seed hash mismatch");
- if (a.get_num_retained() == 0 || b.is_empty()) return compact_theta_sketch_alloc<A>(a, ordered);
const uint64_t theta = std::min(a.get_theta64(), b.get_theta64());
vector_u64<A> keys;
- uint32_t keys_size = 0;
- uint32_t count = 0;
bool is_empty = a.is_empty();
if (b.get_num_retained() == 0) {
- for (auto key: a) if (key < theta) ++count;
- keys.resize(count);
- std::copy_if(a.begin(), a.end(), &keys[0], [theta](uint64_t key) { return key < theta; });
- if (ordered && !a.is_ordered()) std::sort(keys.begin(), keys.end());
- if (count == 0 && theta == theta_sketch_alloc<A>::MAX_THETA) is_empty = true;
- return compact_theta_sketch_alloc<A>(is_empty, theta, std::move(keys), seed_hash_, a.is_ordered() || ordered);
- }
-
- keys_size = a.get_num_retained();
- keys.resize(keys_size);
-
- if (a.is_ordered() && b.is_ordered()) { // sort-based
- const auto end = std::set_difference(a.begin(), a.end(), b.begin(), b.end(), keys.begin());
- count = end - keys.begin();
- } else { // hash-based
- const uint8_t lg_size = lg_size_from_count(b.get_num_retained(), update_theta_sketch_alloc<A>::REBUILD_THRESHOLD);
- vector_u64<A> b_hash_table(1 << lg_size, 0);
- for (auto key: b) {
- if (key < theta) {
- update_theta_sketch_alloc<A>::hash_search_or_insert(key, b_hash_table.data(), lg_size);
- } else if (b.is_ordered()) {
- break; // early stop
+ std::copy_if(a.begin(), a.end(), std::back_inserter(keys), less_than(theta));
+ } else {
+ if (a.is_ordered() && b.is_ordered()) { // sort-based
+ std::set_difference(a.begin(), a.end(), b.begin(), b.end(), conditional_back_inserter(keys, less_than(theta)));
+ } else { // hash-based
+ const uint8_t lg_size = lg_size_from_count(b.get_num_retained(), update_theta_sketch_alloc<A>::REBUILD_THRESHOLD);
+ vector_u64<A> b_hash_table(1 << lg_size, 0);
+ for (auto key: b) {
+ if (key < theta) {
+ update_theta_sketch_alloc<A>::hash_search_or_insert(key, b_hash_table.data(), lg_size);
+ } else if (b.is_ordered()) {
+ break; // early stop
+ }
}
- }
- // scan A lookup B
- for (auto key: a) {
- if (key < theta) {
- if (!update_theta_sketch_alloc<A>::hash_search(key, b_hash_table.data(), lg_size)) keys[count++] = key;
- } else if (a.is_ordered()) {
- break; // early stop
+ // scan A lookup B
+ for (auto key: a) {
+ if (key < theta) {
+ if (!update_theta_sketch_alloc<A>::hash_search(key, b_hash_table.data(), lg_size)) keys.push_back(key);
+ } else if (a.is_ordered()) {
+ break; // early stop
+ }
}
}
}
-
- if (count == 0) {
- keys.resize(0);
- if (theta == theta_sketch_alloc<A>::MAX_THETA) is_empty = true;
- } else if (count < keys_size) {
- keys.resize(count);
- if (ordered && !a.is_ordered()) std::sort(keys.begin(), keys.end());
- }
-
+ if (keys.empty() && theta == theta_sketch_alloc<A>::MAX_THETA) is_empty = true;
+ if (ordered && !a.is_ordered()) std::sort(keys.begin(), keys.end());
return compact_theta_sketch_alloc<A>(is_empty, theta, std::move(keys), seed_hash_, a.is_ordered() || ordered);
}
diff --git a/theta/include/theta_intersection_impl.hpp b/theta/include/theta_intersection_impl.hpp
index 79fea4e..6be6757 100644
--- a/theta/include/theta_intersection_impl.hpp
+++ b/theta/include/theta_intersection_impl.hpp
@@ -44,7 +44,7 @@
template<typename A>
void theta_intersection_alloc<A>::update(const theta_sketch_alloc<A>& sketch) {
if (is_empty_) return;
- if (sketch.get_seed_hash() != seed_hash_) throw std::invalid_argument("seed hash mismatch");
+ if (!sketch.is_empty() && sketch.get_seed_hash() != seed_hash_) throw std::invalid_argument("seed hash mismatch");
is_empty_ |= sketch.is_empty();
theta_ = std::min(theta_, sketch.get_theta64());
if (is_valid_ && num_keys_ == 0) return;
diff --git a/theta/include/theta_sketch_impl.hpp b/theta/include/theta_sketch_impl.hpp
index 417dfa1..0514884 100644
--- a/theta/include/theta_sketch_impl.hpp
+++ b/theta/include/theta_sketch_impl.hpp
@@ -101,9 +101,9 @@
is.read((char*)&seed_hash, sizeof(seed_hash));
check_serial_version(serial_version, SERIAL_VERSION);
- check_seed_hash(seed_hash, get_seed_hash(seed));
if (type == update_theta_sketch_alloc<A>::SKETCH_TYPE) {
+ check_seed_hash(seed_hash, get_seed_hash(seed));
typename update_theta_sketch_alloc<A>::resize_factor rf = static_cast<typename update_theta_sketch_alloc<A>::resize_factor>(preamble_longs >> 6);
typedef typename std::allocator_traits<A>::template rebind_alloc<update_theta_sketch_alloc<A>> AU;
return unique_ptr(
@@ -114,6 +114,8 @@
}
);
} else if (type == compact_theta_sketch_alloc<A>::SKETCH_TYPE) {
+ const bool is_empty = flags_byte & (1 << theta_sketch_alloc<A>::flags::IS_EMPTY);
+ if (!is_empty) check_seed_hash(seed_hash, get_seed_hash(seed));
typedef typename std::allocator_traits<A>::template rebind_alloc<compact_theta_sketch_alloc<A>> AC;
return unique_ptr(
static_cast<theta_sketch_alloc<A>*>(new (AC().allocate(1)) compact_theta_sketch_alloc<A>(compact_theta_sketch_alloc<A>::internal_deserialize(is, preamble_longs, flags_byte, seed_hash))),
@@ -146,9 +148,9 @@
ptr += copy_from_mem(ptr, &seed_hash, sizeof(seed_hash));
check_serial_version(serial_version, SERIAL_VERSION);
- check_seed_hash(seed_hash, get_seed_hash(seed));
if (type == update_theta_sketch_alloc<A>::SKETCH_TYPE) {
+ check_seed_hash(seed_hash, get_seed_hash(seed));
typename update_theta_sketch_alloc<A>::resize_factor rf = static_cast<typename update_theta_sketch_alloc<A>::resize_factor>(preamble_longs >> 6);
typedef typename std::allocator_traits<A>::template rebind_alloc<update_theta_sketch_alloc<A>> AU;
return unique_ptr(
@@ -161,6 +163,8 @@
}
);
} else if (type == compact_theta_sketch_alloc<A>::SKETCH_TYPE) {
+ const bool is_empty = flags_byte & (1 << theta_sketch_alloc<A>::flags::IS_EMPTY);
+ if (!is_empty) check_seed_hash(seed_hash, get_seed_hash(seed));
typedef typename std::allocator_traits<A>::template rebind_alloc<compact_theta_sketch_alloc<A>> AC;
return unique_ptr(
static_cast<theta_sketch_alloc<A>*>(new (AC().allocate(1)) compact_theta_sketch_alloc<A>(
@@ -753,7 +757,8 @@
is.read((char*)&seed_hash, sizeof(seed_hash));
theta_sketch_alloc<A>::check_sketch_type(type, SKETCH_TYPE);
theta_sketch_alloc<A>::check_serial_version(serial_version, theta_sketch_alloc<A>::SERIAL_VERSION);
- theta_sketch_alloc<A>::check_seed_hash(seed_hash, theta_sketch_alloc<A>::get_seed_hash(seed));
+ const bool is_empty = flags_byte & (1 << theta_sketch_alloc<A>::flags::IS_EMPTY);
+ if (!is_empty) theta_sketch_alloc<A>::check_seed_hash(seed_hash, theta_sketch_alloc<A>::get_seed_hash(seed));
return internal_deserialize(is, preamble_longs, flags_byte, seed_hash);
}
@@ -801,7 +806,8 @@
ptr += copy_from_mem(ptr, &seed_hash, sizeof(seed_hash));
theta_sketch_alloc<A>::check_sketch_type(type, SKETCH_TYPE);
theta_sketch_alloc<A>::check_serial_version(serial_version, theta_sketch_alloc<A>::SERIAL_VERSION);
- theta_sketch_alloc<A>::check_seed_hash(seed_hash, theta_sketch_alloc<A>::get_seed_hash(seed));
+ const bool is_empty = flags_byte & (1 << theta_sketch_alloc<A>::flags::IS_EMPTY);
+ if (!is_empty) theta_sketch_alloc<A>::check_seed_hash(seed_hash, theta_sketch_alloc<A>::get_seed_hash(seed));
return internal_deserialize(ptr, size - (ptr - static_cast<const char*>(bytes)), preamble_longs, flags_byte, seed_hash);
}
diff --git a/theta/test/theta_a_not_b_test.cpp b/theta/test/theta_a_not_b_test.cpp
index 54bd3fa..1ef5255 100644
--- a/theta/test/theta_a_not_b_test.cpp
+++ b/theta/test/theta_a_not_b_test.cpp
@@ -217,4 +217,28 @@
REQUIRE_THROWS_AS(a_not_b.compute(sketch, sketch), std::invalid_argument);
}
+TEST_CASE("theta a-not-b: issue #152", "[theta_a_not_b]") {
+ update_theta_sketch a = update_theta_sketch::builder().build();
+ int value = 0;
+ for (int i = 0; i < 10000; i++) a.update(value++);
+
+ update_theta_sketch b = update_theta_sketch::builder().build();
+ value = 5000;
+ for (int i = 0; i < 25000; i++) b.update(value++);
+
+ theta_a_not_b a_not_b;
+
+ // unordered inputs
+ compact_theta_sketch result = a_not_b.compute(a, b);
+ REQUIRE_FALSE(result.is_empty());
+ REQUIRE(result.is_estimation_mode());
+ REQUIRE(result.get_estimate() == Approx(5000).margin(5000 * 0.03));
+
+ // ordered inputs
+ result = a_not_b.compute(a.compact(), b.compact());
+ REQUIRE_FALSE(result.is_empty());
+ REQUIRE(result.is_estimation_mode());
+ REQUIRE(result.get_estimate() == Approx(5000).margin(5000 * 0.03));
+}
+
} /* namespace datasketches */
diff --git a/theta/test/theta_compact_empty_from_java.sk b/theta/test/theta_compact_empty_from_java.sk
index 44730d3..f6c647f 100644
--- a/theta/test/theta_compact_empty_from_java.sk
+++ b/theta/test/theta_compact_empty_from_java.sk
Binary files differ