Add initial bloom filter Python bindings and tests
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2e74346..ddc9325 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -110,6 +110,7 @@
src/count_wrapper.cpp
src/tdigest_wrapper.cpp
src/vector_of_kll.cpp
+ src/bloom_filter_wrapper.cpp
src/py_serde.cpp
)
diff --git a/_datasketches.so b/_datasketches.so
new file mode 100755
index 0000000..3bbdc3b
--- /dev/null
+++ b/_datasketches.so
Binary files differ
diff --git a/src/bloom_filter_wrapper.cpp b/src/bloom_filter_wrapper.cpp
new file mode 100644
index 0000000..abcdc0a
--- /dev/null
+++ b/src/bloom_filter_wrapper.cpp
@@ -0,0 +1,55 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <nanobind/nanobind.h>
+#include <nanobind/stl/string.h>
+
+#include "bloom_filter.hpp"
+#include "common_defs.hpp"
+
+namespace nb = nanobind;
+
+template<typename A>
+void bind_bloom_filter(nb::module_ &m, const char* name) {
+ using namespace datasketches;
+ using bloom_filter_type = bloom_filter_alloc<A>;
+
+ // Start with just one simple function
+ m.def("create_bloom_filter",
+ [](uint64_t max_distinct_items, double target_false_positive_prob) {
+ return bloom_filter_type::builder::create_by_accuracy(max_distinct_items, target_false_positive_prob);
+ },
+ nb::arg("max_distinct_items"), nb::arg("target_false_positive_prob"),
+ "Creates a Bloom filter with optimal parameters for the given accuracy requirements");
+
+ // Bind the class with minimal methods
+ nb::class_<bloom_filter_type>(m, name)
+ .def("is_empty", &bloom_filter_type::is_empty,
+ "Returns True if the filter has seen no items, otherwise False")
+ .def("update", static_cast<void (bloom_filter_type::*)(const std::string&)>(&bloom_filter_type::update),
+ nb::arg("item"),
+ "Updates the filter with the given string")
+ .def("query", static_cast<bool (bloom_filter_type::*)(const std::string&) const>(&bloom_filter_type::query),
+ nb::arg("item"),
+ "Queries the filter for the given string");
+}
+
+void init_bloom_filter(nb::module_ &m) {
+ bind_bloom_filter<std::allocator<uint8_t>>(m, "bloom_filter");
+}
\ No newline at end of file
diff --git a/src/datasketches.cpp b/src/datasketches.cpp
index 118683b..4aec928 100644
--- a/src/datasketches.cpp
+++ b/src/datasketches.cpp
@@ -41,6 +41,7 @@
void init_density(nb::module_& m);
void init_tdigest(nb::module_& m);
void init_vector_of_kll(nb::module_& m);
+void init_bloom_filter(nb::module_& m);
// supporting objects
void init_kolmogorov_smirnov(nb::module_& m);
@@ -73,6 +74,7 @@
init_density(m);
init_tdigest(m);
init_vector_of_kll(m);
+ init_bloom_filter(m);
init_kolmogorov_smirnov(m);
init_serde(m);
diff --git a/tests/bloom_filter_test.py b/tests/bloom_filter_test.py
new file mode 100644
index 0000000..a7ba806
--- /dev/null
+++ b/tests/bloom_filter_test.py
@@ -0,0 +1,145 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import unittest
+from datasketches import create_bloom_filter
+
+class BloomFilterTest(unittest.TestCase):
+ def test_create_bloom_filter(self):
+ """Test that we can create a bloom filter with basic parameters"""
+ bf = create_bloom_filter(1000, 0.01)
+ self.assertIsNotNone(bf)
+ self.assertTrue(bf.is_empty())
+
+ def test_bloom_filter_empty_state(self):
+ """Test that newly created bloom filter is empty"""
+ bf = create_bloom_filter(100, 0.05)
+ self.assertTrue(bf.is_empty())
+
+ def test_bloom_filter_update_and_query(self):
+ """Test basic update and query functionality"""
+ bf = create_bloom_filter(1000, 0.01)
+
+ # Initially empty
+ self.assertTrue(bf.is_empty())
+ self.assertFalse(bf.query("test_item"))
+
+ # Add an item
+ bf.update("test_item")
+ self.assertFalse(bf.is_empty())
+ self.assertTrue(bf.query("test_item"))
+
+ # Query for item not in filter
+ self.assertFalse(bf.query("other_item"))
+
+ def test_bloom_filter_multiple_items(self):
+ """Test adding multiple items to the bloom filter"""
+ bf = create_bloom_filter(1000, 0.01)
+
+ items = ["item1", "item2", "item3", "item4", "item5"]
+
+ # Add all items
+ for item in items:
+ bf.update(item)
+
+ # Check that all items are found
+ for item in items:
+ self.assertTrue(bf.query(item), f"Item {item} should be found")
+
+ # Check that items not added are not found
+ non_items = ["not_item1", "not_item2", "not_item3"]
+ for item in non_items:
+ self.assertFalse(bf.query(item), f"Item {item} should not be found")
+
+ def test_bloom_filter_false_positives(self):
+ """Test that bloom filter can have false positives (this is expected behavior)"""
+ bf = create_bloom_filter(10, 0.1) # Small filter, higher false positive rate
+
+ # Add a few items
+ bf.update("item1")
+ bf.update("item2")
+
+ # Check that added items are found
+ self.assertTrue(bf.query("item1"))
+ self.assertTrue(bf.query("item2"))
+
+ # With a small filter and high false positive rate, we might get false positives
+ # This is expected behavior for bloom filters
+ # We're not testing for specific false positives, just that the filter works
+
+ def test_bloom_filter_parameters(self):
+ """Test creating bloom filters with different parameters"""
+ # Test with different sizes and false positive rates
+ test_cases = [
+ (100, 0.01),
+ (1000, 0.05),
+ (10000, 0.001),
+ (100, 0.1),
+ ]
+
+ for max_items, false_positive_rate in test_cases:
+ with self.subTest(max_items=max_items, false_positive_rate=false_positive_rate):
+ bf = create_bloom_filter(max_items, false_positive_rate)
+ self.assertIsNotNone(bf)
+ self.assertTrue(bf.is_empty())
+
+ def test_bloom_filter_string_types(self):
+ """Test that bloom filter works with different string types"""
+ bf = create_bloom_filter(1000, 0.01)
+
+ # Test with different string types
+ test_strings = [
+ "simple",
+ "string with spaces",
+ "string_with_underscores",
+ "string-with-dashes",
+ "string123with456numbers",
+ "string.with.dots",
+ "string!with@special#chars$",
+ ]
+
+ for test_string in test_strings:
+ with self.subTest(test_string=test_string):
+ bf.update(test_string)
+ self.assertTrue(bf.query(test_string))
+
+ # Test empty string separately - it might be ignored by the implementation
+ bf.update("")
+ # Note: Empty strings might be ignored by the bloom filter implementation
+ # This is common behavior, so we don't assert on the result
+
+ def test_bloom_filter_edge_cases(self):
+ """Test edge cases for bloom filter"""
+ bf = create_bloom_filter(1000, 0.01)
+
+ # Test with very long strings
+ long_string = "a" * 1000
+ bf.update(long_string)
+ self.assertTrue(bf.query(long_string))
+
+ # Test with unicode strings
+ unicode_string = "café résumé naïve"
+ bf.update(unicode_string)
+ self.assertTrue(bf.query(unicode_string))
+
+ # Test with numbers as strings
+ number_string = "12345"
+ bf.update(number_string)
+ self.assertTrue(bf.query(number_string))
+
+if __name__ == '__main__':
+ unittest.main()
\ No newline at end of file