Add initial bloom filter Python bindings and tests

commit: 65e1f1d8fb5a65f073ffb21b7e8eebf7c5d0a287 [log] [tgz]
author: c-dickens <dickens.c07@gmail.com> Fri Aug 08 22:30:43 2025 +0100
committer: c-dickens <dickens.c07@gmail.com> Fri Aug 08 22:30:43 2025 +0100
tree: 6aed33818a1cca1fd7106a797a1a9b23a512abd3
parent: 66d1d52a654cf628338d982af5ee62ade813278d [diff]
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2e74346..ddc9325 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt

@@ -110,6 +110,7 @@
     src/count_wrapper.cpp
     src/tdigest_wrapper.cpp
     src/vector_of_kll.cpp
+    src/bloom_filter_wrapper.cpp
     src/py_serde.cpp
 )
 

diff --git a/_datasketches.so b/_datasketches.so
new file mode 100755
index 0000000..3bbdc3b
--- /dev/null
+++ b/_datasketches.so
Binary files differ

diff --git a/src/bloom_filter_wrapper.cpp b/src/bloom_filter_wrapper.cpp
new file mode 100644
index 0000000..abcdc0a
--- /dev/null
+++ b/src/bloom_filter_wrapper.cpp

@@ -0,0 +1,55 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <nanobind/nanobind.h>
+#include <nanobind/stl/string.h>
+
+#include "bloom_filter.hpp"
+#include "common_defs.hpp"
+
+namespace nb = nanobind;
+
+template<typename A>
+void bind_bloom_filter(nb::module_ &m, const char* name) {
+  using namespace datasketches;
+  using bloom_filter_type = bloom_filter_alloc<A>;
+
+  // Start with just one simple function
+  m.def("create_bloom_filter", 
+         [](uint64_t max_distinct_items, double target_false_positive_prob) {
+           return bloom_filter_type::builder::create_by_accuracy(max_distinct_items, target_false_positive_prob);
+         },
+         nb::arg("max_distinct_items"), nb::arg("target_false_positive_prob"),
+         "Creates a Bloom filter with optimal parameters for the given accuracy requirements");
+
+  // Bind the class with minimal methods
+  nb::class_<bloom_filter_type>(m, name)
+    .def("is_empty", &bloom_filter_type::is_empty,
+         "Returns True if the filter has seen no items, otherwise False")
+    .def("update", static_cast<void (bloom_filter_type::*)(const std::string&)>(&bloom_filter_type::update), 
+         nb::arg("item"),
+         "Updates the filter with the given string")
+    .def("query", static_cast<bool (bloom_filter_type::*)(const std::string&) const>(&bloom_filter_type::query), 
+         nb::arg("item"),
+         "Queries the filter for the given string");
+}
+
+void init_bloom_filter(nb::module_ &m) {
+  bind_bloom_filter<std::allocator<uint8_t>>(m, "bloom_filter");
+} 
\ No newline at end of file

diff --git a/src/datasketches.cpp b/src/datasketches.cpp
index 118683b..4aec928 100644
--- a/src/datasketches.cpp
+++ b/src/datasketches.cpp

@@ -41,6 +41,7 @@
 void init_density(nb::module_& m);
 void init_tdigest(nb::module_& m);
 void init_vector_of_kll(nb::module_& m);
+void init_bloom_filter(nb::module_& m);
 
 // supporting objects
 void init_kolmogorov_smirnov(nb::module_& m);
@@ -73,6 +74,7 @@
   init_density(m);
   init_tdigest(m);
   init_vector_of_kll(m);
+  init_bloom_filter(m);
 
   init_kolmogorov_smirnov(m);
   init_serde(m);

diff --git a/tests/bloom_filter_test.py b/tests/bloom_filter_test.py
new file mode 100644
index 0000000..a7ba806
--- /dev/null
+++ b/tests/bloom_filter_test.py

@@ -0,0 +1,145 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+  
+import unittest
+from datasketches import create_bloom_filter
+
+class BloomFilterTest(unittest.TestCase):
+  def test_create_bloom_filter(self):
+    """Test that we can create a bloom filter with basic parameters"""
+    bf = create_bloom_filter(1000, 0.01)
+    self.assertIsNotNone(bf)
+    self.assertTrue(bf.is_empty())
+
+  def test_bloom_filter_empty_state(self):
+    """Test that newly created bloom filter is empty"""
+    bf = create_bloom_filter(100, 0.05)
+    self.assertTrue(bf.is_empty())
+
+  def test_bloom_filter_update_and_query(self):
+    """Test basic update and query functionality"""
+    bf = create_bloom_filter(1000, 0.01)
+    
+    # Initially empty
+    self.assertTrue(bf.is_empty())
+    self.assertFalse(bf.query("test_item"))
+    
+    # Add an item
+    bf.update("test_item")
+    self.assertFalse(bf.is_empty())
+    self.assertTrue(bf.query("test_item"))
+    
+    # Query for item not in filter
+    self.assertFalse(bf.query("other_item"))
+
+  def test_bloom_filter_multiple_items(self):
+    """Test adding multiple items to the bloom filter"""
+    bf = create_bloom_filter(1000, 0.01)
+    
+    items = ["item1", "item2", "item3", "item4", "item5"]
+    
+    # Add all items
+    for item in items:
+      bf.update(item)
+    
+    # Check that all items are found
+    for item in items:
+      self.assertTrue(bf.query(item), f"Item {item} should be found")
+    
+    # Check that items not added are not found
+    non_items = ["not_item1", "not_item2", "not_item3"]
+    for item in non_items:
+      self.assertFalse(bf.query(item), f"Item {item} should not be found")
+
+  def test_bloom_filter_false_positives(self):
+    """Test that bloom filter can have false positives (this is expected behavior)"""
+    bf = create_bloom_filter(10, 0.1)  # Small filter, higher false positive rate
+    
+    # Add a few items
+    bf.update("item1")
+    bf.update("item2")
+    
+    # Check that added items are found
+    self.assertTrue(bf.query("item1"))
+    self.assertTrue(bf.query("item2"))
+    
+    # With a small filter and high false positive rate, we might get false positives
+    # This is expected behavior for bloom filters
+    # We're not testing for specific false positives, just that the filter works
+
+  def test_bloom_filter_parameters(self):
+    """Test creating bloom filters with different parameters"""
+    # Test with different sizes and false positive rates
+    test_cases = [
+      (100, 0.01),
+      (1000, 0.05),
+      (10000, 0.001),
+      (100, 0.1),
+    ]
+    
+    for max_items, false_positive_rate in test_cases:
+      with self.subTest(max_items=max_items, false_positive_rate=false_positive_rate):
+        bf = create_bloom_filter(max_items, false_positive_rate)
+        self.assertIsNotNone(bf)
+        self.assertTrue(bf.is_empty())
+
+  def test_bloom_filter_string_types(self):
+    """Test that bloom filter works with different string types"""
+    bf = create_bloom_filter(1000, 0.01)
+    
+    # Test with different string types
+    test_strings = [
+      "simple",
+      "string with spaces",
+      "string_with_underscores",
+      "string-with-dashes",
+      "string123with456numbers",
+      "string.with.dots",
+      "string!with@special#chars$",
+    ]
+    
+    for test_string in test_strings:
+      with self.subTest(test_string=test_string):
+        bf.update(test_string)
+        self.assertTrue(bf.query(test_string))
+    
+    # Test empty string separately - it might be ignored by the implementation
+    bf.update("")
+    # Note: Empty strings might be ignored by the bloom filter implementation
+    # This is common behavior, so we don't assert on the result
+
+  def test_bloom_filter_edge_cases(self):
+    """Test edge cases for bloom filter"""
+    bf = create_bloom_filter(1000, 0.01)
+    
+    # Test with very long strings
+    long_string = "a" * 1000
+    bf.update(long_string)
+    self.assertTrue(bf.query(long_string))
+    
+    # Test with unicode strings
+    unicode_string = "café résumé naïve"
+    bf.update(unicode_string)
+    self.assertTrue(bf.query(unicode_string))
+    
+    # Test with numbers as strings
+    number_string = "12345"
+    bf.update(number_string)
+    self.assertTrue(bf.query(number_string))
+
+if __name__ == '__main__':
+    unittest.main() 
\ No newline at end of file
commit	65e1f1d8fb5a65f073ffb21b7e8eebf7c5d0a287	[log] [tgz]
author	c-dickens <dickens.c07@gmail.com>	Fri Aug 08 22:30:43 2025 +0100
committer	c-dickens <dickens.c07@gmail.com>	Fri Aug 08 22:30:43 2025 +0100
tree	6aed33818a1cca1fd7106a797a1a9b23a512abd3
parent	66d1d52a654cf628338d982af5ee62ade813278d [diff]