extracted constants, added doc
diff --git a/tuple/CMakeLists.txt b/tuple/CMakeLists.txt
index 5e92873..d354850 100644
--- a/tuple/CMakeLists.txt
+++ b/tuple/CMakeLists.txt
@@ -52,6 +52,8 @@
list(APPEND tuple_HEADERS "include/bounds_on_ratios_in_sampled_sets.hpp")
list(APPEND tuple_HEADERS "include/bounds_on_ratios_in_theta_sketched_sets.hpp")
list(APPEND tuple_HEADERS "include/jaccard_similarity.hpp")
+list(APPEND tuple_HEADERS "include/theta_comparators.hpp")
+list(APPEND tuple_HEADERS "include/theta_constants.hpp")
install(TARGETS tuple
EXPORT ${PROJECT_NAME}
@@ -97,4 +99,6 @@
${CMAKE_CURRENT_SOURCE_DIR}/include/bounds_on_ratios_in_sampled_sets.hpp
${CMAKE_CURRENT_SOURCE_DIR}/include/bounds_on_ratios_in_theta_sketched_sets.hpp
${CMAKE_CURRENT_SOURCE_DIR}/include/jaccard_similarity.hpp
+ ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_comparators.hpp
+ ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_constants.hpp
)
diff --git a/tuple/include/jaccard_similarity.hpp b/tuple/include/jaccard_similarity.hpp
index cf11fa5..bea700d 100644
--- a/tuple/include/jaccard_similarity.hpp
+++ b/tuple/include/jaccard_similarity.hpp
@@ -36,6 +36,22 @@
template<typename Union, typename Intersection, typename ExtractKey>
class jaccard_similarity_base {
public:
+
+ /**
+ * Computes the Jaccard similarity index with upper and lower bounds. The Jaccard similarity index
+ * <i>J(A,B) = (A ^ B)/(A U B)</i> is used to measure how similar the two sketches are to each
+ * other. If J = 1.0, the sketches are considered equal. If J = 0, the two sketches are
+ * disjoint. A Jaccard of .95 means the overlap between the two
+ * sets is 95% of the union of the two sets.
+ *
+ * <p>Note: For very large pairs of sketches, where the configured nominal entries of the sketches
+ * are 2^25 or 2^26, this method may produce unpredictable results.
+ *
+ * @param sketch_a given sketch A
+ * @param sketch_b given sketch B
+ * @return an array of three doubles {LowerBound, Estimate, UpperBound} for the Jaccard index.
+ * The Upper and Lower bounds are for a confidence interval of 95.4% or +/- 2 standard deviations.
+ */
template<typename SketchA, typename SketchB>
static std::array<double, 3> jaccard(const SketchA& sketch_a, const SketchB& sketch_b) {
if (&sketch_a == &sketch_b) return {1, 1, 1};
@@ -45,7 +61,7 @@
// union
const unsigned count_a = sketch_a.get_num_retained();
const unsigned count_b = sketch_b.get_num_retained();
- const unsigned lg_k = std::max(log2(ceiling_power_of_2(count_a + count_b)), theta_constants::MIN_LG_K);
+ const unsigned lg_k = std::min(std::max(log2(ceiling_power_of_2(count_a + count_b)), theta_constants::MIN_LG_K), theta_constants::MAX_LG_K);
auto u = typename Union::builder().set_lg_k(lg_k).build();
u.update(sketch_a);
u.update(sketch_b);
diff --git a/tuple/include/theta_constants.hpp b/tuple/include/theta_constants.hpp
new file mode 100644
index 0000000..989681f
--- /dev/null
+++ b/tuple/include/theta_constants.hpp
@@ -0,0 +1,41 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#ifndef THETA_CONSTANTS_HPP_
+#define THETA_CONSTANTS_HPP_
+
+#include <climits> // LLONG_MAX
+#include <cstdint> // uint8_t, uint64_t
+
+namespace datasketches {
+
+// Constants shared by the theta sketch headers (extracted from theta_update_sketch_base.hpp).
+namespace theta_constants {
+ // NOTE(review): presumably the growth multipliers for the internal hash table -- confirm against usage
+ enum resize_factor { X1, X2, X4, X8 };
+ static const uint64_t MAX_THETA = LLONG_MAX; // signed max for compatibility with Java
+ // lower bound applied to lg_k (e.g. when sizing the union in jaccard_similarity.hpp)
+ static const uint8_t MIN_LG_K = 5;
+ // upper bound applied to lg_k (e.g. when sizing the union in jaccard_similarity.hpp)
+ static const uint8_t MAX_LG_K = 26;
+}
+
+} /* namespace datasketches */
+
+#endif
diff --git a/tuple/include/theta_update_sketch_base.hpp b/tuple/include/theta_update_sketch_base.hpp
index 7318b91..f81a2ba 100644
--- a/tuple/include/theta_update_sketch_base.hpp
+++ b/tuple/include/theta_update_sketch_base.hpp
@@ -27,15 +27,10 @@
#include "common_defs.hpp"
#include "MurmurHash3.h"
#include "theta_comparators.hpp"
+#include "theta_constants.hpp"
namespace datasketches {
-namespace theta_constants {
- enum resize_factor { X1, X2, X4, X8 };
- static const uint64_t MAX_THETA = LLONG_MAX; // signed max for compatibility with Java
- static const uint8_t MIN_LG_K = 5;
-}
-
template<
typename Entry,
typename ExtractKey,