blob: 5507c071bb9b444765978e15c7effe421b9ea963 [file]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
#include <algorithm>
#include <fstream>
#include <sstream>
#include <string>
#include <vector>
#include <catch2/catch.hpp>
#include "array_of_strings_sketch.hpp"
namespace datasketches {
TEST_CASE("aos update policy", "[tuple_sketch]") {
default_array_of_strings_update_policy policy;
SECTION("create empty") {
auto values = policy.create();
REQUIRE(values.size() == 0);
}
SECTION("replace array") {
auto values = policy.create();
array_of_strings input(2, "", std::allocator<std::string>());
input[0] = "alpha";
input[1] = "beta";
policy.update(values, input);
REQUIRE(values.size() == 2);
REQUIRE(values[0] == "alpha");
REQUIRE(values[1] == "beta");
input[0] = "changed";
REQUIRE(values[0] == "alpha");
array_of_strings input2(1, "", std::allocator<std::string>());
input2[0] = "gamma";
policy.update(values, input2);
REQUIRE(values.size() == 1);
REQUIRE(values[0] == "gamma");
}
SECTION("nullptr clears") {
array_of_strings values(2, "", std::allocator<std::string>());
values[0] = "one";
values[1] = "two";
policy.update(values, nullptr);
REQUIRE(values.size() == 0);
}
SECTION("pointer input copies") {
auto values = policy.create();
array_of_strings input(2, "", std::allocator<std::string>());
input[0] = "first";
input[1] = "second";
policy.update(values, &input);
REQUIRE(values.size() == 2);
REQUIRE(values[1] == "second");
input[1] = "changed";
REQUIRE(values[1] == "second");
}
}
TEST_CASE("aos sketch update", "[tuple_sketch]") {
auto make_array = [](std::initializer_list<const char*> entries) {
array_of_strings array(static_cast<uint8_t>(entries.size()), "", std::allocator<std::string>());
uint8_t i = 0;
for (const auto* entry: entries) array[i++] = entry;
return array;
};
SECTION("same key replaces summary") {
auto sketch = update_array_of_strings_tuple_sketch<>::builder().build();
sketch.update(
hash_array_of_strings_key(make_array({"alpha", "beta"})),
make_array({"first"})
);
sketch.update(
hash_array_of_strings_key(make_array({"alpha", "beta"})),
make_array({"second", "third"})
);
REQUIRE(sketch.get_num_retained() == 1);
auto it = sketch.begin();
REQUIRE(it != sketch.end());
REQUIRE(it->second.size() == 2);
REQUIRE(it->second[0] == "second");
REQUIRE(it->second[1] == "third");
}
SECTION("distinct keys retain multiple entries") {
auto sketch = update_array_of_strings_tuple_sketch<>::builder().build();
sketch.update(
hash_array_of_strings_key(make_array({"a", "bc"})),
make_array({"one"})
);
sketch.update(
hash_array_of_strings_key(make_array({"ab", "c"})),
make_array({"two"})
);
REQUIRE(sketch.get_num_retained() == 2);
bool saw_one = false;
bool saw_two = false;
for (const auto& entry: sketch) {
REQUIRE(entry.second.size() == 1);
if (entry.second[0] == "one") saw_one = true;
if (entry.second[0] == "two") saw_two = true;
}
REQUIRE(saw_one);
REQUIRE(saw_two);
}
SECTION("empty key") {
auto sketch = update_array_of_strings_tuple_sketch<>::builder().build();
sketch.update(hash_array_of_strings_key(make_array({})), make_array({"value"}));
REQUIRE(sketch.get_num_retained() == 1);
auto it = sketch.begin();
REQUIRE(it != sketch.end());
REQUIRE(it->second.size() == 1);
REQUIRE(it->second[0] == "value");
}
}
TEST_CASE("aos sketch: serialize deserialize", "[tuple_sketch]") {
auto make_array = [](std::initializer_list<std::string> entries) {
array_of_strings array(static_cast<uint8_t>(entries.size()), "", std::allocator<std::string>());
uint8_t i = 0;
for (const auto& entry: entries) array[i++] = entry;
return array;
};
auto collect_entries = [](const compact_array_of_strings_tuple_sketch<>& sketch) {
typedef std::pair<uint64_t, array_of_strings> entry_type;
std::vector<entry_type> entries;
for (const auto& entry: sketch) entries.push_back(entry);
struct entry_less {
bool operator()(const entry_type& lhs, const entry_type& rhs) const {
return lhs.first < rhs.first;
}
};
std::sort(entries.begin(), entries.end(), entry_less());
return entries;
};
auto check_round_trip = [&](const compact_array_of_strings_tuple_sketch<>& compact_sketch) {
std::stringstream ss;
ss.exceptions(std::ios::failbit | std::ios::badbit);
compact_sketch.serialize(ss, default_array_of_strings_serde<>());
auto deserialized_stream = compact_array_of_strings_tuple_sketch<>::deserialize(
ss, DEFAULT_SEED, default_array_of_strings_serde<>()
);
auto bytes = compact_sketch.serialize(0, default_array_of_strings_serde<>());
auto deserialized_bytes = compact_array_of_strings_tuple_sketch<>::deserialize(
bytes.data(), bytes.size(), DEFAULT_SEED, default_array_of_strings_serde<>()
);
const compact_array_of_strings_tuple_sketch<>* deserialized_list[2] = {
&deserialized_stream,
&deserialized_bytes
};
for (int list_index = 0; list_index < 2; ++list_index) {
const compact_array_of_strings_tuple_sketch<>* deserialized = deserialized_list[list_index];
REQUIRE(compact_sketch.is_empty() == deserialized->is_empty());
REQUIRE(compact_sketch.is_estimation_mode() == deserialized->is_estimation_mode());
REQUIRE(compact_sketch.is_ordered() == deserialized->is_ordered());
REQUIRE(compact_sketch.get_num_retained() == deserialized->get_num_retained());
REQUIRE(compact_sketch.get_theta() == Approx(deserialized->get_theta()).margin(1e-10));
REQUIRE(compact_sketch.get_estimate() == Approx(deserialized->get_estimate()).margin(1e-10));
REQUIRE(compact_sketch.get_lower_bound(1) == Approx(deserialized->get_lower_bound(1)).margin(1e-10));
REQUIRE(compact_sketch.get_upper_bound(1) == Approx(deserialized->get_upper_bound(1)).margin(1e-10));
auto original_entries = collect_entries(compact_sketch);
auto round_trip_entries = collect_entries(*deserialized);
REQUIRE(original_entries.size() == round_trip_entries.size());
for (size_t i = 0; i < original_entries.size(); ++i) {
REQUIRE(original_entries[i].first == round_trip_entries[i].first);
REQUIRE(original_entries[i].second.size() == round_trip_entries[i].second.size());
for (size_t j = 0; j < original_entries[i].second.size(); ++j) {
REQUIRE(original_entries[i].second[static_cast<uint8_t>(j)] ==
round_trip_entries[i].second[static_cast<uint8_t>(j)]);
}
}
}
};
auto run_tests = [&](const update_array_of_strings_tuple_sketch<>& sketch) {
auto ordered = compact_array_of_strings_sketch(sketch, true);
auto unordered = compact_array_of_strings_sketch(sketch, false);
check_round_trip(ordered);
check_round_trip(unordered);
};
SECTION("empty sketch") {
auto sketch = update_array_of_strings_tuple_sketch<>::builder().build();
run_tests(sketch);
}
SECTION("single entry sketch") {
auto sketch = update_array_of_strings_tuple_sketch<>::builder().build();
sketch.update(hash_array_of_strings_key(make_array({"key"})), make_array({"value"}));
run_tests(sketch);
}
SECTION("multiple entries exact mode") {
auto sketch = update_array_of_strings_tuple_sketch<>::builder().set_lg_k(8).build();
for (int i = 0; i < 50; ++i) {
sketch.update(
hash_array_of_strings_key(make_array({std::string("key-") + std::to_string(i)})),
make_array({std::string("value-") + std::to_string(i), "extra"})
);
}
REQUIRE_FALSE(sketch.is_estimation_mode());
run_tests(sketch);
}
SECTION("multiple entries estimation mode") {
auto sketch = update_array_of_strings_tuple_sketch<>::builder().build();
for (int i = 0; i < 10000; ++i) {
sketch.update(
hash_array_of_strings_key(make_array({std::string("key-") + std::to_string(i)})),
make_array({std::string("value-") + std::to_string(i)})
);
}
REQUIRE(sketch.is_estimation_mode());
run_tests(sketch);
}
}
TEST_CASE("aos serde validation", "[tuple_sketch]") {
default_array_of_strings_serde<> serde;
SECTION("too many nodes rejected") {
array_of_strings array(128, "", std::allocator<std::string>());
std::stringstream ss;
ss.exceptions(std::ios::failbit | std::ios::badbit);
REQUIRE_THROWS_WITH(
serde.serialize(ss, &array, 1),
Catch::Matchers::Contains("size exceeds 127")
);
}
}
} /* namespace datasketches */