blob: af37d6c2bb7353e9fa75fa1ccab27fff3ec4838c [file]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
#include <catch2/catch.hpp>
#include <fstream>
#include <vector>
#include "array_of_strings_sketch.hpp"
namespace datasketches {
// Directory holding the binary sketches used by the tests in this file.
// The sketches are assumed to have been generated by datasketches-java code and placed
// in the subdirectory called "java" in the root directory of this project.
// The value is only ever read, so it is declared const to rule out accidental modification.
static const std::string testBinaryInputPath = std::string(TEST_BINARY_INPUT_PATH) + "../../java/";
// Loads the whole file at `path` into memory and returns its contents as raw bytes.
// Stream exceptions (failbit | badbit) are switched on before the file is opened, so a
// failure to open, seek or read is reported by the stream throwing rather than by a
// return value. A file of length zero yields an empty vector.
static std::vector<uint8_t> read_binary_file(const std::string& path) {
  std::ifstream input;
  input.exceptions(std::ios::failbit | std::ios::badbit);
  input.open(path, std::ios::binary);

  // measure the file by jumping to its end, then rewind so reading starts at the beginning
  input.seekg(0, std::ios::end);
  const std::streamoff length = input.tellg();
  input.seekg(0, std::ios::beg);

  std::vector<uint8_t> contents(static_cast<size_t>(length));
  if (contents.empty()) {
    // nothing to read
    return contents;
  }
  input.read(reinterpret_cast<char*>(contents.data()), static_cast<std::streamsize>(contents.size()));
  return contents;
}
// Compatibility check for array-of-strings tuple sketches written by datasketches-java
// in which every retained entry is expected to carry exactly one string.
// For each input size n the file "aos_1_n<n>_java.sk" is deserialized twice, once from
// a stream and once from an in-memory byte buffer, and both results must satisfy the
// same expectations.
TEST_CASE("aos sketch one value", "[serde_compat]") {
  const unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000};
  for (const unsigned n: n_arr) {
    const auto path = testBinaryInputPath + "aos_1_n" + std::to_string(n) + "_java.sk";
    // Catch2 requires sections created inside a loop to have a different name on every
    // iteration, so the loop value is made part of each section name.
    const auto section_suffix = " n=" + std::to_string(n);

    // expectations shared by both deserialization paths
    const auto check = [n](const compact_array_of_strings_tuple_sketch<>& sketch) {
      REQUIRE(sketch.is_empty() == (n == 0));
      REQUIRE(sketch.is_estimation_mode() == (n > 1000));
      REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.03));
      for (const auto& entry: sketch) {
        REQUIRE(entry.first < sketch.get_theta64());
        REQUIRE(entry.second.size() == 1);
      }
    };

    SECTION("stream" + section_suffix) {
      std::ifstream is;
      // enable exceptions before opening so that a missing file fails the test
      is.exceptions(std::ios::failbit | std::ios::badbit);
      is.open(path, std::ios::binary);
      const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize(
        is, DEFAULT_SEED, default_array_of_strings_serde<>()
      );
      check(sketch);
    }
    SECTION("bytes" + section_suffix) {
      const auto bytes = read_binary_file(path);
      const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize(
        bytes.data(), bytes.size(), DEFAULT_SEED, default_array_of_strings_serde<>()
      );
      check(sketch);
    }
  }
}
// Compatibility check for array-of-strings tuple sketches written by datasketches-java
// in which every retained entry is expected to carry exactly three strings.
// For each input size n the file "aos_3_n<n>_java.sk" is deserialized twice, once from
// a stream and once from an in-memory byte buffer, and both results must satisfy the
// same expectations.
TEST_CASE("aos sketch three values", "[serde_compat]") {
  const unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000};
  for (const unsigned n: n_arr) {
    const auto path = testBinaryInputPath + "aos_3_n" + std::to_string(n) + "_java.sk";
    // Catch2 requires sections created inside a loop to have a different name on every
    // iteration, so the loop value is made part of each section name.
    const auto section_suffix = " n=" + std::to_string(n);

    // expectations shared by both deserialization paths
    const auto check = [n](const compact_array_of_strings_tuple_sketch<>& sketch) {
      REQUIRE(sketch.is_empty() == (n == 0));
      REQUIRE(sketch.is_estimation_mode() == (n > 1000));
      REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.03));
      for (const auto& entry: sketch) {
        REQUIRE(entry.first < sketch.get_theta64());
        REQUIRE(entry.second.size() == 3);
      }
    };

    SECTION("stream" + section_suffix) {
      std::ifstream is;
      // enable exceptions before opening so that a missing file fails the test
      is.exceptions(std::ios::failbit | std::ios::badbit);
      is.open(path, std::ios::binary);
      const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize(
        is, DEFAULT_SEED, default_array_of_strings_serde<>()
      );
      check(sketch);
    }
    SECTION("bytes" + section_suffix) {
      const auto bytes = read_binary_file(path);
      const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize(
        bytes.data(), bytes.size(), DEFAULT_SEED, default_array_of_strings_serde<>()
      );
      check(sketch);
    }
  }
}
// Reads the Java-written sketch "aos_1_non_empty_no_entries_java.sk" through both
// deserialization paths (stream and byte buffer). In either case the sketch is expected
// to report that it is not empty while retaining zero entries.
TEST_CASE("aos sketch non-empty no entries", "[serde_compat]") {
  const std::string sketch_file = testBinaryInputPath + "aos_1_non_empty_no_entries_java.sk";

  // expectations common to both sections
  const auto verify = [](const compact_array_of_strings_tuple_sketch<>& sketch) {
    REQUIRE_FALSE(sketch.is_empty());
    REQUIRE(sketch.get_num_retained() == 0);
  };

  SECTION("stream") {
    std::ifstream input;
    // exceptions are enabled before opening so that a missing file fails the test
    input.exceptions(std::ios::failbit | std::ios::badbit);
    input.open(sketch_file, std::ios::binary);
    const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize(
      input, DEFAULT_SEED, default_array_of_strings_serde<>()
    );
    verify(sketch);
  }

  SECTION("bytes") {
    const std::vector<uint8_t> raw = read_binary_file(sketch_file);
    const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize(
      raw.data(), raw.size(), DEFAULT_SEED, default_array_of_strings_serde<>()
    );
    verify(sketch);
  }
}
// Compatibility check for the "multikey" array-of-strings tuple sketches written by
// datasketches-java. Every retained entry is expected to carry exactly one string.
// For each input size n the file "aos_multikey_n<n>_java.sk" is deserialized twice,
// once from a stream and once from an in-memory byte buffer, and both results must
// satisfy the same expectations.
TEST_CASE("aos sketch multi keys strings", "[serde_compat]") {
  const unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000};
  for (const unsigned n: n_arr) {
    const auto path = testBinaryInputPath + "aos_multikey_n" + std::to_string(n) + "_java.sk";
    // Catch2 requires sections created inside a loop to have a different name on every
    // iteration, so the loop value is made part of each section name.
    const auto section_suffix = " n=" + std::to_string(n);

    // expectations shared by both deserialization paths
    const auto check = [n](const compact_array_of_strings_tuple_sketch<>& sketch) {
      REQUIRE(sketch.is_empty() == (n == 0));
      REQUIRE(sketch.is_estimation_mode() == (n > 1000));
      REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.03));
      for (const auto& entry: sketch) {
        REQUIRE(entry.first < sketch.get_theta64());
        REQUIRE(entry.second.size() == 1);
      }
    };

    SECTION("stream" + section_suffix) {
      std::ifstream is;
      // enable exceptions before opening so that a missing file fails the test
      is.exceptions(std::ios::failbit | std::ios::badbit);
      is.open(path, std::ios::binary);
      const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize(
        is, DEFAULT_SEED, default_array_of_strings_serde<>()
      );
      check(sketch);
    }
    SECTION("bytes" + section_suffix) {
      const auto bytes = read_binary_file(path);
      const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize(
        bytes.data(), bytes.size(), DEFAULT_SEED, default_array_of_strings_serde<>()
      );
      check(sketch);
    }
  }
}
// Reads the Java-written sketch "aos_unicode_java.sk" through both deserialization
// paths (stream and byte buffer) and verifies that the three retained entries are,
// in any order, exactly the three expected two-string arrays of non-ASCII text.
TEST_CASE("aos sketch unicode strings", "[serde_compat]") {
  const std::string sketch_file = testBinaryInputPath + "aos_unicode_java.sk";

  const auto check = [](const compact_array_of_strings_tuple_sketch<>& sketch) {
    REQUIRE_FALSE(sketch.is_empty());
    REQUIRE_FALSE(sketch.is_estimation_mode());
    REQUIRE(sketch.get_num_retained() == 3);

    const std::vector<std::vector<std::string>> expected_values = {
      {"밸류", "값"},
      {"📦", "🎁"},
      {"ценить1", "ценить2"}
    };
    const size_t none = expected_values.size();
    // matched[i] becomes true once expected_values[i] has been claimed by an entry,
    // so that each expected array can be paired with at most one entry
    std::vector<bool> matched(expected_values.size(), false);

    for (const auto& entry: sketch) {
      REQUIRE(entry.first < sketch.get_theta64());
      REQUIRE(entry.second.size() == 2);

      // index of the first unclaimed expected array equal to this entry, or `none`
      size_t hit = none;
      for (size_t i = 0; hit == none && i < expected_values.size(); ++i) {
        if (matched[i]) continue;
        const auto& expected = expected_values[i];
        if (entry.second.size() != expected.size()) continue;
        // advance while the strings agree; reaching the end means all of them agree
        size_t j = 0;
        while (j < expected.size() && entry.second[j] == expected[j]) ++j;
        if (j == expected.size()) hit = i;
      }

      const bool found = (hit != none);
      if (found) matched[hit] = true;
      REQUIRE(found);
    }

    // every expected array must have been seen
    for (bool found: matched) REQUIRE(found);
  };

  SECTION("stream") {
    std::ifstream input;
    // exceptions are enabled before opening so that a missing file fails the test
    input.exceptions(std::ios::failbit | std::ios::badbit);
    input.open(sketch_file, std::ios::binary);
    const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize(
      input, DEFAULT_SEED, default_array_of_strings_serde<>()
    );
    check(sketch);
  }

  SECTION("bytes") {
    const std::vector<uint8_t> raw = read_binary_file(sketch_file);
    const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize(
      raw.data(), raw.size(), DEFAULT_SEED, default_array_of_strings_serde<>()
    );
    check(sketch);
  }
}
// Reads the Java-written sketch "aos_empty_strings_java.sk" through both
// deserialization paths (stream and byte buffer) and verifies that the three retained
// entries are, in any order, exactly the three expected arrays: one holding a single
// non-empty string, one holding a single empty string and one holding two empty strings.
TEST_CASE("aos sketch empty strings", "[serde_compat]") {
  const std::string sketch_file = testBinaryInputPath + "aos_empty_strings_java.sk";

  const auto check = [](const compact_array_of_strings_tuple_sketch<>& sketch) {
    REQUIRE_FALSE(sketch.is_empty());
    REQUIRE_FALSE(sketch.is_estimation_mode());
    REQUIRE(sketch.get_num_retained() == 3);

    const std::vector<std::vector<std::string>> expected_values = {
      {"empty_key_value"},
      {""},
      {"", ""}
    };
    const size_t none = expected_values.size();
    // matched[i] becomes true once expected_values[i] has been claimed by an entry,
    // so that each expected array can be paired with at most one entry
    std::vector<bool> matched(expected_values.size(), false);

    for (const auto& entry: sketch) {
      REQUIRE(entry.first < sketch.get_theta64());

      // index of the first unclaimed expected array equal to this entry, or `none`
      size_t hit = none;
      for (size_t i = 0; hit == none && i < expected_values.size(); ++i) {
        if (matched[i]) continue;
        const auto& expected = expected_values[i];
        // the arrays differ in length here, so length is checked before the contents
        if (entry.second.size() != expected.size()) continue;
        // advance while the strings agree; reaching the end means all of them agree
        size_t j = 0;
        while (j < expected.size() && entry.second[j] == expected[j]) ++j;
        if (j == expected.size()) hit = i;
      }

      const bool found = (hit != none);
      if (found) matched[hit] = true;
      REQUIRE(found);
    }

    // every expected array must have been seen
    for (bool found: matched) REQUIRE(found);
  };

  SECTION("stream") {
    std::ifstream input;
    // exceptions are enabled before opening so that a missing file fails the test
    input.exceptions(std::ios::failbit | std::ios::badbit);
    input.open(sketch_file, std::ios::binary);
    const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize(
      input, DEFAULT_SEED, default_array_of_strings_serde<>()
    );
    check(sketch);
  }

  SECTION("bytes") {
    const std::vector<uint8_t> raw = read_binary_file(sketch_file);
    const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize(
      raw.data(), raw.size(), DEFAULT_SEED, default_array_of_strings_serde<>()
    );
    check(sketch);
  }
}
}