implemented filter
diff --git a/tuple/include/tuple_sketch.hpp b/tuple/include/tuple_sketch.hpp
index 383e573..cbfd9f1 100644
--- a/tuple/include/tuple_sketch.hpp
+++ b/tuple/include/tuple_sketch.hpp
@@ -381,6 +381,15 @@
*/
compact_tuple_sketch<Summary, Allocator> compact(bool ordered = true) const;
+ /**
+ * Produces a Compact Tuple sketch from this sketch
+ * by applying a given predicate to each entry.
+ * @param predicate should return true for the entries to keep
+ * @return compact sketch with the entries retained according to the predicate
+ */
+ template<typename Predicate>
+ compact_tuple_sketch<Summary, Allocator> filter(const Predicate& predicate) const;
+
virtual iterator begin();
virtual iterator end();
virtual const_iterator begin() const;
@@ -481,6 +490,25 @@
virtual uint16_t get_seed_hash() const;
/**
+ * Produces a Compact Tuple sketch from this sketch
+ * by applying a given predicate to each entry.
+ * @param predicate should return true for the entries to keep
+ * @return compact sketch with the entries retained according to the predicate
+ */
+ template<typename Predicate>
+ compact_tuple_sketch filter(const Predicate& predicate) const;
+
+ /**
+ * Produces a Compact Tuple sketch from a given sketch (Update or Compact)
+ * by applying a given predicate to each entry.
+ * @param sketch input sketch
+ * @param predicate should return true for the entries to keep
+ * @return compact sketch with the entries retained according to the predicate
+ */
+ template<typename Sketch, typename Predicate>
+ static compact_tuple_sketch filter(const Sketch& sketch, const Predicate& predicate);
+
+ /**
* This method serializes the sketch into a given stream in a binary form
* @param os output stream
* @param sd instance of a SerDe
@@ -579,7 +607,6 @@
template<typename E, typename EK, typename P, typename S, typename CS, typename A> friend class theta_intersection_base;
template<typename E, typename EK, typename CS, typename A> friend class theta_set_difference_base;
compact_tuple_sketch(bool is_empty, bool is_ordered, uint16_t seed_hash, uint64_t theta, std::vector<Entry, AllocEntry>&& entries);
-
};
/// Tuple base builder
diff --git a/tuple/include/tuple_sketch_impl.hpp b/tuple/include/tuple_sketch_impl.hpp
index 0766e4d..e5bf834 100644
--- a/tuple/include/tuple_sketch_impl.hpp
+++ b/tuple/include/tuple_sketch_impl.hpp
@@ -259,6 +259,12 @@
}
template<typename S, typename U, typename P, typename A>
+template<typename Predicate>
+compact_tuple_sketch<S, A> update_tuple_sketch<S, U, P, A>::filter(const Predicate& predicate) const {
+ return compact_tuple_sketch<S, A>::filter(*this, predicate);
+}
+
+template<typename S, typename U, typename P, typename A>
void update_tuple_sketch<S, U, P, A>::print_specifics(std::ostringstream& os) const {
os << " lg nominal size : " << (int) map_.lg_nom_size_ << std::endl;
os << " lg current size : " << (int) map_.lg_cur_size_ << std::endl;
@@ -344,6 +350,33 @@
return seed_hash_;
}
+template<typename S, typename A>
+template<typename Predicate>
+compact_tuple_sketch<S, A> compact_tuple_sketch<S, A>::filter(const Predicate& predicate) const {
+ return filter(*this, predicate);
+}
+
+template<typename S, typename A>
+template<typename Sketch, typename Predicate>
+compact_tuple_sketch<S, A> compact_tuple_sketch<S, A>::filter(const Sketch& sketch, const Predicate& predicate) {
+ std::vector<Entry, AllocEntry> entries(sketch.get_allocator());
+ entries.reserve(sketch.get_num_retained());
+ std::copy_if(
+ sketch.begin(),
+ sketch.end(),
+ std::back_inserter(entries),
+ [&predicate](const Entry& e) {return predicate(e.second);}
+ );
+ entries.shrink_to_fit();
+ return compact_tuple_sketch(
+ !sketch.is_estimation_mode() && entries.empty(),
+ sketch.is_ordered(),
+ sketch.get_seed_hash(),
+ sketch.get_theta64(),
+ std::move(entries)
+ );
+}
+
// implementation for fixed-size arithmetic types (integral and floating point)
template<typename S, typename A>
template<typename SD, typename SS, typename std::enable_if<std::is_arithmetic<SS>::value, int>::type>
diff --git a/tuple/test/tuple_sketch_test.cpp b/tuple/test/tuple_sketch_test.cpp
index b5316e8..6e44e28 100644
--- a/tuple/test/tuple_sketch_test.cpp
+++ b/tuple/test/tuple_sketch_test.cpp
@@ -310,4 +310,65 @@
REQUIRE(sketch.get_num_retained() == 3);
}
+TEST_CASE("filter", "[tuple_sketch]") {
+ auto usk = update_tuple_sketch<int>::builder().build();
+
+ { // empty update sketch
+ auto sk = usk.filter([](int){return true;});
+ REQUIRE(sk.is_empty());
+ REQUIRE(sk.is_ordered());
+ REQUIRE(sk.get_num_retained() == 0);
+ }
+
+ { // empty compact sketch
+ auto sk = usk.compact().filter([](int){return true;});
+ REQUIRE(sk.is_empty());
+ REQUIRE(sk.is_ordered());
+ REQUIRE(sk.get_num_retained() == 0);
+ }
+
+ usk.update(1, 1);
+ usk.update(1, 1);
+ usk.update(2, 1);
+ usk.update(2, 1);
+ usk.update(3, 1);
+
+ { // exact mode update sketch
+ auto sk = usk.filter([](int v){return v > 1;});
+ REQUIRE_FALSE(sk.is_empty());
+ REQUIRE_FALSE(sk.is_ordered());
+ REQUIRE_FALSE(sk.is_estimation_mode());
+ REQUIRE(sk.get_num_retained() == 2);
+ }
+
+ { // exact mode compact sketch
+ auto sk = usk.compact().filter([](int v){return v > 1;});
+ REQUIRE_FALSE(sk.is_empty());
+ REQUIRE(sk.is_ordered());
+ REQUIRE_FALSE(sk.is_estimation_mode());
+ REQUIRE(sk.get_num_retained() == 2);
+ }
+
+ // only keys 1 and 2 had values of 2, which will become 3 after this update
+ // some entries are discarded in estimation mode, but these happen to survive
+ // the process is deterministic, so the test will always work
+ for (int i = 0; i < 10000; ++i) usk.update(i, 1);
+
+ { // estimation mode update sketch
+ auto sk = usk.filter([](int v){return v > 2;});
+ REQUIRE_FALSE(sk.is_empty());
+ REQUIRE_FALSE(sk.is_ordered());
+ REQUIRE(sk.is_estimation_mode());
+ REQUIRE(sk.get_num_retained() == 2);
+ }
+
+ { // estimation mode compact sketch
+ auto sk = usk.compact().filter([](int v){return v > 2;});
+ REQUIRE_FALSE(sk.is_empty());
+ REQUIRE(sk.is_ordered());
+ REQUIRE(sk.is_estimation_mode());
+ REQUIRE(sk.get_num_retained() == 2);
+ }
+}
+
} /* namespace datasketches */