blob: 3250901d9ec7f619d4f88d948ba886a4c2cf5836 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
#include <catch.hpp>
#include <sstream>
#include "hll.hpp"
namespace datasketches {
static int min(int a, int b) {
return (a < b) ? a : b;
}
static void println(std::string& str) {
//std::cout << str << "\n";
}
static void basicUnion(uint64_t n1, uint64_t n2,
uint64_t lgk1, uint64_t lgk2, uint64_t lgMaxK,
target_hll_type type1, target_hll_type type2, target_hll_type resultType) {
uint64_t v = 0;
//int tot = n1 + n2;
hll_sketch h1(lgk1, type1);
hll_sketch h2(lgk2, type2);
int lgControlK = min(min(lgk1, lgk2), lgMaxK);
hll_sketch control(lgControlK, resultType);
for (uint64_t i = 0; i < n1; ++i) {
h1.update(v + i);
control.update(v + i);
}
v += n1;
for (uint64_t i = 0; i < n2; ++i) {
h2.update(v + i);
control.update(v + i);
}
v += n2;
hll_union u(lgMaxK);
u.update(std::move(h1));
u.update(h2);
hll_sketch result = u.get_result(resultType);
// force non-HIP estimates to avoid issues with in- vs out-of-order
double uEst = result.get_composite_estimate();
double uUb = result.get_upper_bound(2);
double uLb = result.get_lower_bound(2);
//double rerr = ((uEst/tot) - 1.0) * 100;
double controlEst = control.get_composite_estimate();
double controlUb = control.get_upper_bound(2);
double controlLb = control.get_lower_bound(2);
REQUIRE((controlUb - controlEst) >= 0.0);
REQUIRE((uUb - uEst) >= 0.0);
REQUIRE((controlEst - controlLb) >= 0.0);
REQUIRE((uEst - uLb) >= 0.0);
REQUIRE(controlEst == uEst);
}
/**
* The task here is to check the transition boundaries as the sketch morphs between LIST to
* SET to HLL modes. The transition points vary as a function of lgConfigK. In addition,
* this checks that the union operation is operating properly based on the order the
* sketches are presented to the union.
*/
TEST_CASE("hll union: check unions", "[hll_union]") {
target_hll_type type1 = HLL_8;
target_hll_type type2 = HLL_8;
target_hll_type resultType = HLL_8;
uint64_t lgK1 = 7;
uint64_t lgK2 = 7;
uint64_t lgMaxK = 7;
uint64_t n1 = 7;
uint64_t n2 = 7;
basicUnion(n1, n2, lgK1, lgK2, lgMaxK, type1, type2, resultType);
n1 = 8;
n2 = 7;
basicUnion(n1, n2, lgK1, lgK2, lgMaxK, type1, type2, resultType);
n1 = 7;
n2 = 8;
basicUnion(n1, n2, lgK1, lgK2, lgMaxK, type1, type2, resultType);
n1 = 8;
n2 = 8;
basicUnion(n1, n2, lgK1, lgK2, lgMaxK, type1, type2, resultType);
n1 = 7;
n2 = 14;
basicUnion(n1, n2, lgK1, lgK2, lgMaxK, type1, type2, resultType);
int i = 0;
for (i = 7; i <= 13; ++i) {
lgK1 = i;
lgK2 = i;
lgMaxK = i;
{
n1 = ((1 << (i - 3)) * 3)/4; // compute the transition point
n2 = n1;
basicUnion(n1, n2, lgK1, lgK2, lgMaxK, type1, type2, resultType);
n1 += 2;
basicUnion(n1, n2, lgK1, lgK2, lgMaxK, type1, type2, resultType);
n1 -= 2;
n2 += 2;
basicUnion(n1, n2, lgK1, lgK2, lgMaxK, type1, type2, resultType);
n1 += 2;
basicUnion(n1, n2, lgK1, lgK2, lgMaxK, type1, type2, resultType);
}
lgK1 = i;
lgK2 = i + 1;
lgMaxK = i;
{
n1 = ((1 << (i - 3)) * 3)/4;
n2 = n1;
basicUnion(n1, n2, lgK1, lgK2, lgMaxK, type1, type2, resultType);
n1 += 2;
basicUnion(n1, n2, lgK1, lgK2, lgMaxK, type1, type2, resultType);
n1 -= 2;
n2 += 2;
basicUnion(n1, n2, lgK1, lgK2, lgMaxK, type1, type2, resultType);
n1 += 2;
basicUnion(n1, n2, lgK1, lgK2, lgMaxK, type1, type2, resultType);
}
lgK1 = i + 1;
lgK2 = i;
lgMaxK = i;
{
n1 = ((1 << (i - 3)) * 3)/4;
n2 = n1;
basicUnion(n1, n2, lgK1, lgK2, lgMaxK, type1, type2, resultType);
n1 += 2;
basicUnion(n1, n2, lgK1, lgK2, lgMaxK, type1, type2, resultType);
n1 -= 2;
n2 += 2;
basicUnion(n1, n2, lgK1, lgK2, lgMaxK, type1, type2, resultType);
n1 += 2;
basicUnion(n1, n2, lgK1, lgK2, lgMaxK, type1, type2, resultType);
}
lgK1 = i + 1;
lgK2 = i + 1;
lgMaxK = i;
{
n1 = ((1 << (i - 3)) * 3)/4;
n2 = n1;
basicUnion(n1, n2, lgK1, lgK2, lgMaxK, type1, type2, resultType);
n1 += 2;
basicUnion(n1, n2, lgK1, lgK2, lgMaxK, type1, type2, resultType);
n1 -= 2;
n2 += 2;
basicUnion(n1, n2, lgK1, lgK2, lgMaxK, type1, type2, resultType);
n1 += 2;
basicUnion(n1, n2, lgK1, lgK2, lgMaxK, type1, type2, resultType);
}
}
}
TEST_CASE("hll union: check composite estimate", "[hll_union]") {
hll_union u(12);
REQUIRE(u.is_empty());
REQUIRE(u.get_composite_estimate() == Approx(0.0).margin(0.03));
for (int i = 1; i <= 15; ++i) { u.update(i); }
REQUIRE(u.get_composite_estimate() == Approx(15.0).margin(15 * 0.03));
for (int i = 16; i <= 1000; ++i) { u.update(i); }
REQUIRE(u.get_composite_estimate() == Approx(1000.0).margin(1000 * 0.03));
}
TEST_CASE("hll union: check config k limits", "[hll_union]") {
REQUIRE_THROWS_AS(hll_union(HllUtil<>::MIN_LOG_K - 1), std::invalid_argument);
REQUIRE_THROWS_AS(hll_union(HllUtil<>::MAX_LOG_K + 1), std::invalid_argument);
}
static double getBound(int lgK, bool ub, bool oooFlag, int numStdDev, double est) {
double re = RelativeErrorTables<>::getRelErr(ub, oooFlag, lgK, numStdDev);
return est / (1.0 + re);
}
TEST_CASE("hll union: check ub lb", "[hll_union]") {
int lgK = 4;
int n = 1 << 20;
bool oooFlag = false;
double bound;
std::string str;
bound = (getBound(lgK, true, oooFlag, 3, n) / n) - 1;
str = "LgK=" + std::to_string(lgK) + ", UB3: " + std::to_string(bound);
println(str);
bound = (getBound(lgK, true, oooFlag, 2, n) / n) - 1;
str = "LgK=" + std::to_string(lgK) + ", UB2: " + std::to_string(bound);
println(str);
bound = (getBound(lgK, true, oooFlag, 1, n) / n) - 1;
str = "LgK=" + std::to_string(lgK) + ", UB1: " + std::to_string(bound);
println(str);
bound = (getBound(lgK, false, oooFlag, 1, n) / n) - 1;
str = "LgK=" + std::to_string(lgK) + ", LB1: " + std::to_string(bound);
println(str);
bound = (getBound(lgK, false, oooFlag, 2, n) / n) - 1;
str = "LgK=" + std::to_string(lgK) + ", LB2: " + std::to_string(bound);
println(str);
bound = (getBound(lgK, false, oooFlag, 3, n) / n) - 1;
str = "LgK=" + std::to_string(lgK) + ", LB3: " + std::to_string(bound);
println(str);
}
TEST_CASE("hll union: check conversions", "[hll_union]") {
int lgK = 4;
hll_sketch sk1(lgK, HLL_8);
hll_sketch sk2(lgK, HLL_8);
int n = 1 << 20;
for (int i = 0; i < n; ++i) {
sk1.update(i);
sk2.update(i + n);
}
hll_union hllUnion(lgK);
hllUnion.update(sk1);
hllUnion.update(sk2);
hll_sketch rsk1 = hllUnion.get_result(HLL_4);
hll_sketch rsk2 = hllUnion.get_result(HLL_6);
hll_sketch rsk3 = hllUnion.get_result(HLL_8);
double est1 = rsk1.get_estimate();
double est2 = rsk2.get_estimate();
double est3 = rsk3.get_estimate();
REQUIRE(est1 == est2);
REQUIRE(est1 == est3);
}
TEST_CASE("hll union: check input types", "[hll_union]") {
hll_union u(8);
// inserting the same value as a variety of input types
u.update((uint8_t) 102);
u.update((uint16_t) 102);
u.update((uint32_t) 102);
u.update((uint64_t) 102);
u.update((int8_t) 102);
u.update((int16_t) 102);
u.update((int32_t) 102);
u.update((int64_t) 102);
REQUIRE(u.get_estimate() == Approx(1.0).margin(0.01));
// identical binary representations
// no unsigned in Java, but need to sign-extend both as Java would do
u.update((uint8_t) 255);
u.update((int8_t) -1);
u.update((float) -2.0);
u.update((double) -2.0);
std::string str = "input string";
u.update(str);
u.update(str.c_str(), str.length());
REQUIRE(u.get_estimate() == Approx(4.0).margin(0.01));
u = hll_union(8);
u.update((float) 0.0);
u.update((float) -0.0);
u.update((double) 0.0);
u.update((double) -0.0);
REQUIRE(u.get_estimate() == Approx(1.0).margin(0.01));
u = hll_union(8);
u.update(std::nanf("3"));
u.update(std::nan("12"));
REQUIRE(1.0 == Approx(u.get_estimate()).margin(0.01));
REQUIRE(u.get_result().get_estimate() == Approx(u.get_estimate()).margin(0.01));
u = hll_union(8);
u.update(nullptr, 0);
u.update("");
REQUIRE(u.is_empty());
}
static void union_two_sketches_with_overlap(int num, uint8_t lg_k, target_hll_type type) {
hll_sketch sketch1(lg_k, type);
for (int key = 0; key < num; key++) sketch1.update(key);
const int overlap = num / 2;
hll_sketch sketch2(lg_k, type);
for (int key = overlap; key < num + overlap; key++) sketch2.update(key);
hll_union u(lg_k);
u.update(sketch1);
u.update(sketch2);
hll_sketch sketch = u.get_result(type);
REQUIRE(sketch.get_estimate() == Approx(num * 1.5).margin(num * 1.5 * 0.02));
}
TEST_CASE("hll union: check hll to hll", "[hll_union]") {
union_two_sketches_with_overlap(1000000, 11, HLL_4);
}
} /* namespace datasketches */