Merge pull request #155 from apache/patch_for_rc4

Apply changes to create rc4
diff --git a/theta/CMakeLists.txt b/theta/CMakeLists.txt
index ae02e79..69af9db 100644
--- a/theta/CMakeLists.txt
+++ b/theta/CMakeLists.txt
@@ -36,6 +36,7 @@
 list(APPEND theta_HEADERS "include/theta_sketch.hpp;include/theta_union.hpp;include/theta_intersection.hpp")
 list(APPEND theta_HEADERS "include/theta_a_not_b.hpp;include/binomial_bounds.hpp;include/theta_sketch_impl.hpp")
 list(APPEND theta_HEADERS "include/theta_union_impl.hpp;include/theta_intersection_impl.hpp;include/theta_a_not_b_impl.hpp")
+list(APPEND theta_HEADERS "include/conditional_back_inserter.hpp")
 
 install(TARGETS theta
   EXPORT ${PROJECT_NAME}
@@ -55,4 +56,5 @@
     ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_union_impl.hpp
     ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_intersection_impl.hpp
     ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_a_not_b_impl.hpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/include/conditional_back_inserter.hpp
 )
diff --git a/theta/include/conditional_back_inserter.hpp b/theta/include/conditional_back_inserter.hpp
new file mode 100644
index 0000000..9b33833
--- /dev/null
+++ b/theta/include/conditional_back_inserter.hpp
@@ -0,0 +1,63 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#ifndef CONDITIONAL_BACK_INSERTER_HPP_
+#define CONDITIONAL_BACK_INSERTER_HPP_
+
+#include <iterator>
+#include <functional>
+
+namespace datasketches {
+
+template <typename Container, typename Predicate>
+class conditional_back_insert_iterator: public std::back_insert_iterator<Container> {
+public:
+  template<typename P>
+  conditional_back_insert_iterator(Container& c, P&& p): std::back_insert_iterator<Container>(c), p(std::forward<P>(p)) {}
+
+  // MSVC seems to insist on having copy constructor and assignment
+  conditional_back_insert_iterator(const conditional_back_insert_iterator& other):
+    std::back_insert_iterator<Container>(other), p(other.p) {}
+  conditional_back_insert_iterator& operator=(const conditional_back_insert_iterator& other) {
+    std::back_insert_iterator<Container>::operator=(other);
+    p = other.p;
+    return *this;
+  }
+
+  conditional_back_insert_iterator& operator=(typename Container::const_reference value) {
+    if (p(value)) std::back_insert_iterator<Container>::operator=(value);
+    return *this;
+  }
+
+  conditional_back_insert_iterator& operator*() { return *this; }
+  conditional_back_insert_iterator& operator++() { return *this; }
+  conditional_back_insert_iterator& operator++(int) { return *this; }
+
+private:
+  Predicate p;
+};
+
+template<typename Container, typename Predicate>
+conditional_back_insert_iterator<Container, Predicate> conditional_back_inserter(Container& c, Predicate&& p) {
+  return conditional_back_insert_iterator<Container, Predicate>(c, std::forward<Predicate>(p));
+}
+
+} /* namespace datasketches */
+
+#endif
diff --git a/theta/include/theta_a_not_b.hpp b/theta/include/theta_a_not_b.hpp
index 5287887..db66ac7 100644
--- a/theta/include/theta_a_not_b.hpp
+++ b/theta/include/theta_a_not_b.hpp
@@ -54,6 +54,13 @@
   typedef typename std::allocator_traits<A>::template rebind_alloc<uint64_t> AllocU64;
   uint16_t seed_hash_;
 
+  class less_than {
+  public:
+    explicit less_than(uint64_t value): value(value) {}
+    bool operator()(uint64_t value) const { return value < this->value; }
+  private:
+    uint64_t value;
+  };
 };
 
 // alias with default allocator for convenience
diff --git a/theta/include/theta_a_not_b_impl.hpp b/theta/include/theta_a_not_b_impl.hpp
index cc171ce..4343ee3 100644
--- a/theta/include/theta_a_not_b_impl.hpp
+++ b/theta/include/theta_a_not_b_impl.hpp
@@ -22,6 +22,8 @@
 
 #include <algorithm>
 
+#include "conditional_back_inserter.hpp"
+
 namespace datasketches {
 
 /*
@@ -37,61 +39,42 @@
 
 template<typename A>
 compact_theta_sketch_alloc<A> theta_a_not_b_alloc<A>::compute(const theta_sketch_alloc<A>& a, const theta_sketch_alloc<A>& b, bool ordered) const {
-  if (a.is_empty()) return compact_theta_sketch_alloc<A>(a, ordered);
+  if (a.is_empty() || a.get_num_retained() == 0 || b.is_empty()) return compact_theta_sketch_alloc<A>(a, ordered);
   if (a.get_seed_hash() != seed_hash_) throw std::invalid_argument("A seed hash mismatch");
   if (b.get_seed_hash() != seed_hash_) throw std::invalid_argument("B seed hash mismatch");
-  if (a.get_num_retained() == 0 || b.is_empty()) return compact_theta_sketch_alloc<A>(a, ordered);
 
   const uint64_t theta = std::min(a.get_theta64(), b.get_theta64());
   vector_u64<A> keys;
-  uint32_t keys_size = 0;
-  uint32_t count = 0;
   bool is_empty = a.is_empty();
 
   if (b.get_num_retained() == 0) {
-    for (auto key: a) if (key < theta) ++count;
-    keys.resize(count);
-    std::copy_if(a.begin(), a.end(), &keys[0], [theta](uint64_t key) { return key < theta; });
-    if (ordered && !a.is_ordered()) std::sort(keys.begin(), keys.end());
-    if (count == 0 && theta == theta_sketch_alloc<A>::MAX_THETA) is_empty = true;
-    return compact_theta_sketch_alloc<A>(is_empty, theta, std::move(keys), seed_hash_, a.is_ordered() || ordered);
-  }
-
-  keys_size = a.get_num_retained();
-  keys.resize(keys_size);
-
-  if (a.is_ordered() && b.is_ordered()) { // sort-based
-    const auto end = std::set_difference(a.begin(), a.end(), b.begin(), b.end(), keys.begin());
-    count = end - keys.begin();
-  } else { // hash-based
-    const uint8_t lg_size = lg_size_from_count(b.get_num_retained(), update_theta_sketch_alloc<A>::REBUILD_THRESHOLD);
-    vector_u64<A> b_hash_table(1 << lg_size, 0);
-    for (auto key: b) {
-      if (key < theta) {
-        update_theta_sketch_alloc<A>::hash_search_or_insert(key, b_hash_table.data(), lg_size);
-      } else if (b.is_ordered()) {
-        break; // early stop
+    std::copy_if(a.begin(), a.end(), std::back_inserter(keys), less_than(theta));
+  } else {
+    if (a.is_ordered() && b.is_ordered()) { // sort-based
+      std::set_difference(a.begin(), a.end(), b.begin(), b.end(), conditional_back_inserter(keys, less_than(theta)));
+    } else { // hash-based
+      const uint8_t lg_size = lg_size_from_count(b.get_num_retained(), update_theta_sketch_alloc<A>::REBUILD_THRESHOLD);
+      vector_u64<A> b_hash_table(1 << lg_size, 0);
+      for (auto key: b) {
+        if (key < theta) {
+          update_theta_sketch_alloc<A>::hash_search_or_insert(key, b_hash_table.data(), lg_size);
+        } else if (b.is_ordered()) {
+          break; // early stop
+        }
       }
-    }
 
-    // scan A lookup B
-    for (auto key: a) {
-      if (key < theta) {
-        if (!update_theta_sketch_alloc<A>::hash_search(key, b_hash_table.data(), lg_size)) keys[count++] = key;
-      } else if (a.is_ordered()) {
-        break; // early stop
+      // scan A lookup B
+      for (auto key: a) {
+        if (key < theta) {
+          if (!update_theta_sketch_alloc<A>::hash_search(key, b_hash_table.data(), lg_size)) keys.push_back(key);
+        } else if (a.is_ordered()) {
+          break; // early stop
+        }
       }
     }
   }
-
-  if (count == 0) {
-    keys.resize(0);
-    if (theta == theta_sketch_alloc<A>::MAX_THETA) is_empty = true;
-  } else if (count < keys_size) {
-    keys.resize(count);
-    if (ordered && !a.is_ordered()) std::sort(keys.begin(), keys.end());
-  }
-
+  if (keys.empty() && theta == theta_sketch_alloc<A>::MAX_THETA) is_empty = true;
+  if (ordered && !a.is_ordered()) std::sort(keys.begin(), keys.end());
   return compact_theta_sketch_alloc<A>(is_empty, theta, std::move(keys), seed_hash_, a.is_ordered() || ordered);
 }
 
diff --git a/theta/include/theta_intersection_impl.hpp b/theta/include/theta_intersection_impl.hpp
index 79fea4e..6be6757 100644
--- a/theta/include/theta_intersection_impl.hpp
+++ b/theta/include/theta_intersection_impl.hpp
@@ -44,7 +44,7 @@
 template<typename A>
 void theta_intersection_alloc<A>::update(const theta_sketch_alloc<A>& sketch) {
   if (is_empty_) return;
-  if (sketch.get_seed_hash() != seed_hash_) throw std::invalid_argument("seed hash mismatch");
+  if (!sketch.is_empty() && sketch.get_seed_hash() != seed_hash_) throw std::invalid_argument("seed hash mismatch");
   is_empty_ |= sketch.is_empty();
   theta_ = std::min(theta_, sketch.get_theta64());
   if (is_valid_ && num_keys_ == 0) return;
diff --git a/theta/include/theta_sketch_impl.hpp b/theta/include/theta_sketch_impl.hpp
index 417dfa1..0514884 100644
--- a/theta/include/theta_sketch_impl.hpp
+++ b/theta/include/theta_sketch_impl.hpp
@@ -101,9 +101,9 @@
   is.read((char*)&seed_hash, sizeof(seed_hash));
 
   check_serial_version(serial_version, SERIAL_VERSION);
-  check_seed_hash(seed_hash, get_seed_hash(seed));
 
   if (type == update_theta_sketch_alloc<A>::SKETCH_TYPE) {
+    check_seed_hash(seed_hash, get_seed_hash(seed));
     typename update_theta_sketch_alloc<A>::resize_factor rf = static_cast<typename update_theta_sketch_alloc<A>::resize_factor>(preamble_longs >> 6);
     typedef typename std::allocator_traits<A>::template rebind_alloc<update_theta_sketch_alloc<A>> AU;
     return unique_ptr(
@@ -114,6 +114,8 @@
       }
     );
   } else if (type == compact_theta_sketch_alloc<A>::SKETCH_TYPE) {
+    const bool is_empty = flags_byte & (1 << theta_sketch_alloc<A>::flags::IS_EMPTY);
+    if (!is_empty) check_seed_hash(seed_hash, get_seed_hash(seed));
     typedef typename std::allocator_traits<A>::template rebind_alloc<compact_theta_sketch_alloc<A>> AC;
     return unique_ptr(
       static_cast<theta_sketch_alloc<A>*>(new (AC().allocate(1)) compact_theta_sketch_alloc<A>(compact_theta_sketch_alloc<A>::internal_deserialize(is, preamble_longs, flags_byte, seed_hash))),
@@ -146,9 +148,9 @@
   ptr += copy_from_mem(ptr, &seed_hash, sizeof(seed_hash));
 
   check_serial_version(serial_version, SERIAL_VERSION);
-  check_seed_hash(seed_hash, get_seed_hash(seed));
 
   if (type == update_theta_sketch_alloc<A>::SKETCH_TYPE) {
+    check_seed_hash(seed_hash, get_seed_hash(seed));
     typename update_theta_sketch_alloc<A>::resize_factor rf = static_cast<typename update_theta_sketch_alloc<A>::resize_factor>(preamble_longs >> 6);
     typedef typename std::allocator_traits<A>::template rebind_alloc<update_theta_sketch_alloc<A>> AU;
     return unique_ptr(
@@ -161,6 +163,8 @@
       }
     );
   } else if (type == compact_theta_sketch_alloc<A>::SKETCH_TYPE) {
+    const bool is_empty = flags_byte & (1 << theta_sketch_alloc<A>::flags::IS_EMPTY);
+    if (!is_empty) check_seed_hash(seed_hash, get_seed_hash(seed));
     typedef typename std::allocator_traits<A>::template rebind_alloc<compact_theta_sketch_alloc<A>> AC;
     return unique_ptr(
       static_cast<theta_sketch_alloc<A>*>(new (AC().allocate(1)) compact_theta_sketch_alloc<A>(
@@ -753,7 +757,8 @@
   is.read((char*)&seed_hash, sizeof(seed_hash));
   theta_sketch_alloc<A>::check_sketch_type(type, SKETCH_TYPE);
   theta_sketch_alloc<A>::check_serial_version(serial_version, theta_sketch_alloc<A>::SERIAL_VERSION);
-  theta_sketch_alloc<A>::check_seed_hash(seed_hash, theta_sketch_alloc<A>::get_seed_hash(seed));
+  const bool is_empty = flags_byte & (1 << theta_sketch_alloc<A>::flags::IS_EMPTY);
+  if (!is_empty) theta_sketch_alloc<A>::check_seed_hash(seed_hash, theta_sketch_alloc<A>::get_seed_hash(seed));
   return internal_deserialize(is, preamble_longs, flags_byte, seed_hash);
 }
 
@@ -801,7 +806,8 @@
   ptr += copy_from_mem(ptr, &seed_hash, sizeof(seed_hash));
   theta_sketch_alloc<A>::check_sketch_type(type, SKETCH_TYPE);
   theta_sketch_alloc<A>::check_serial_version(serial_version, theta_sketch_alloc<A>::SERIAL_VERSION);
-  theta_sketch_alloc<A>::check_seed_hash(seed_hash, theta_sketch_alloc<A>::get_seed_hash(seed));
+  const bool is_empty = flags_byte & (1 << theta_sketch_alloc<A>::flags::IS_EMPTY);
+  if (!is_empty) theta_sketch_alloc<A>::check_seed_hash(seed_hash, theta_sketch_alloc<A>::get_seed_hash(seed));
   return internal_deserialize(ptr, size - (ptr - static_cast<const char*>(bytes)), preamble_longs, flags_byte, seed_hash);
 }
 
diff --git a/theta/test/theta_a_not_b_test.cpp b/theta/test/theta_a_not_b_test.cpp
index 54bd3fa..1ef5255 100644
--- a/theta/test/theta_a_not_b_test.cpp
+++ b/theta/test/theta_a_not_b_test.cpp
@@ -217,4 +217,28 @@
   REQUIRE_THROWS_AS(a_not_b.compute(sketch, sketch), std::invalid_argument);
 }
 
+TEST_CASE("theta a-not-b: issue #152", "[theta_a_not_b]") {
+  update_theta_sketch a = update_theta_sketch::builder().build();
+  int value = 0;
+  for (int i = 0; i < 10000; i++) a.update(value++);
+
+  update_theta_sketch b = update_theta_sketch::builder().build();
+  value = 5000;
+  for (int i = 0; i < 25000; i++) b.update(value++);
+
+  theta_a_not_b a_not_b;
+
+  // unordered inputs
+  compact_theta_sketch result = a_not_b.compute(a, b);
+  REQUIRE_FALSE(result.is_empty());
+  REQUIRE(result.is_estimation_mode());
+  REQUIRE(result.get_estimate() == Approx(5000).margin(5000 * 0.03));
+
+  // ordered inputs
+  result = a_not_b.compute(a.compact(), b.compact());
+  REQUIRE_FALSE(result.is_empty());
+  REQUIRE(result.is_estimation_mode());
+  REQUIRE(result.get_estimate() == Approx(5000).margin(5000 * 0.03));
+}
+
 } /* namespace datasketches */
diff --git a/theta/test/theta_compact_empty_from_java.sk b/theta/test/theta_compact_empty_from_java.sk
index 44730d3..f6c647f 100644
--- a/theta/test/theta_compact_empty_from_java.sk
+++ b/theta/test/theta_compact_empty_from_java.sk
Binary files differ