blob: 8c2b9bfdef247cbc85bd99d80991765e3fee8929 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
#include <catch.hpp>
#include <sstream>
#include <fstream>
#include "frequent_items_sketch.hpp"
#ifdef TEST_BINARY_INPUT_PATH
static std::string testBinaryInputPath = TEST_BINARY_INPUT_PATH;
#else
static std::string testBinaryInputPath = "test/";
#endif
namespace datasketches {
TEST_CASE("frequent items: invalid k", "[frequent_items_sketch]") {
REQUIRE_THROWS_AS(frequent_items_sketch<int>(2), std::invalid_argument);
}
TEST_CASE("frequent items: empty", "[frequent_items_sketch]") {
frequent_items_sketch<int> sketch(3);
REQUIRE(sketch.is_empty());
REQUIRE(sketch.get_num_active_items() == 0);
REQUIRE(sketch.get_total_weight() == 0);
}
TEST_CASE("frequent items: one item", "[frequent_items_sketch]") {
frequent_items_sketch<std::string> sketch(3);
sketch.update("a");
REQUIRE_FALSE(sketch.is_empty());
REQUIRE(sketch.get_num_active_items() == 1);
REQUIRE(sketch.get_total_weight() == 1);
REQUIRE(sketch.get_estimate("a") == 1);
REQUIRE(sketch.get_lower_bound("a") == 1);
REQUIRE(sketch.get_upper_bound("a") == 1);
}
TEST_CASE("frequent items: several items, no resize, no purge", "[frequent_items_sketch]") {
frequent_items_sketch<std::string> sketch(3);
sketch.update("a");
sketch.update("b");
sketch.update("c");
sketch.update("d");
sketch.update("b");
sketch.update("c");
sketch.update("b");
REQUIRE_FALSE(sketch.is_empty());
REQUIRE(sketch.get_total_weight() == 7);
REQUIRE(sketch.get_num_active_items() == 4);
REQUIRE(sketch.get_estimate("a") == 1);
REQUIRE(sketch.get_estimate("b") == 3);
REQUIRE(sketch.get_estimate("c") == 2);
REQUIRE(sketch.get_estimate("d") == 1);
}
TEST_CASE("frequent items: several items, with resize, no purge", "[frequent_items_sketch]") {
frequent_items_sketch<std::string> sketch(4);
sketch.update("a");
sketch.update("b");
sketch.update("c");
sketch.update("d");
sketch.update("b");
sketch.update("c");
sketch.update("b");
sketch.update("e");
sketch.update("f");
sketch.update("g");
sketch.update("h");
sketch.update("i");
sketch.update("j");
sketch.update("k");
sketch.update("l");
REQUIRE_FALSE(sketch.is_empty());
REQUIRE(sketch.get_total_weight() == 15);
REQUIRE(sketch.get_num_active_items() == 12);
REQUIRE(sketch.get_estimate("a") == 1);
REQUIRE(sketch.get_estimate("b") == 3);
REQUIRE(sketch.get_estimate("c") == 2);
REQUIRE(sketch.get_estimate("d") == 1);
}
TEST_CASE("frequent items: estimation mode", "[frequent_items_sketch]") {
frequent_items_sketch<int> sketch(3);
sketch.update(1, 10);
sketch.update(2);
sketch.update(3);
sketch.update(4);
sketch.update(5);
sketch.update(6);
sketch.update(7, 15);
sketch.update(8);
sketch.update(9);
sketch.update(10);
sketch.update(11);
sketch.update(12);
REQUIRE(sketch.get_maximum_error() > 0); // estimation mode
REQUIRE_FALSE(sketch.is_empty());
REQUIRE(sketch.get_total_weight() == 35);
auto items = sketch.get_frequent_items(frequent_items_error_type::NO_FALSE_POSITIVES);
REQUIRE(items.size() == 2); // only 2 items (1 and 7) should have counts more than 1
REQUIRE(items[0].get_item() == 7);
REQUIRE(items[0].get_estimate() == 15);
REQUIRE(items[1].get_item() == 1);
REQUIRE(items[1].get_estimate() == 10);
items = sketch.get_frequent_items(frequent_items_error_type::NO_FALSE_NEGATIVES);
REQUIRE(2 <= items.size()); // at least 2 items
REQUIRE(12 >= items.size()); // but not more than 12 items
}
TEST_CASE("frequent items: merge exact mode", "[frequent_items_sketch]") {
frequent_items_sketch<int> sketch1(3);
sketch1.update(1);
sketch1.update(2);
sketch1.update(3);
sketch1.update(4);
frequent_items_sketch<int> sketch2(3);
sketch1.update(2);
sketch1.update(3);
sketch1.update(2);
sketch1.merge(sketch2);
REQUIRE_FALSE(sketch1.is_empty());
REQUIRE(sketch1.get_total_weight() == 7);
REQUIRE(sketch1.get_num_active_items() == 4);
REQUIRE(sketch1.get_estimate(1) == 1);
REQUIRE(sketch1.get_estimate(2) == 3);
REQUIRE(sketch1.get_estimate(3) == 2);
REQUIRE(sketch1.get_estimate(4) == 1);
}
TEST_CASE("frequent items: merge estimation mode", "[frequent_items_sketch]") {
frequent_items_sketch<int> sketch1(4);
sketch1.update(1, 9); // to make sure it survives the purge
sketch1.update(2);
sketch1.update(3);
sketch1.update(4);
sketch1.update(5);
sketch1.update(6);
sketch1.update(7);
sketch1.update(8);
sketch1.update(9);
sketch1.update(10);
sketch1.update(11);
sketch1.update(12);
sketch1.update(13);
sketch1.update(14);
REQUIRE(sketch1.get_maximum_error() > 0); // estimation mode
frequent_items_sketch<int> sketch2(4);
sketch2.update(8);
sketch2.update(9);
sketch2.update(10);
sketch2.update(11);
sketch2.update(12);
sketch2.update(13);
sketch2.update(14);
sketch2.update(15);
sketch2.update(16);
sketch2.update(17);
sketch2.update(18);
sketch2.update(19);
sketch2.update(20);
sketch2.update(21, 11); // to make sure it survives the purge
REQUIRE(sketch2.get_maximum_error() > 0); // estimation mode
sketch1.merge(sketch2);
REQUIRE_FALSE(sketch1.is_empty());
REQUIRE(sketch1.get_total_weight() == 46);
REQUIRE(2 <= sketch1.get_num_active_items());
auto items = sketch1.get_frequent_items(frequent_items_error_type::NO_FALSE_POSITIVES, 2);
REQUIRE(items.size() == 2); // only 2 items (1 and 21) should be above threshold
REQUIRE(items[0].get_item() == 21);
REQUIRE(11 <= items[0].get_estimate()); // always overestimated
REQUIRE(items[1].get_item() == 1);
REQUIRE(9 <= items[1].get_estimate()); // always overestimated
}
TEST_CASE("frequent items: deserialize from java long", "[frequent_items_sketch]") {
std::ifstream is;
is.exceptions(std::ios::failbit | std::ios::badbit);
is.open(testBinaryInputPath + "longs_sketch_from_java.sk", std::ios::binary);
auto sketch = frequent_items_sketch<long long>::deserialize(is);
REQUIRE_FALSE(sketch.is_empty());
REQUIRE(sketch.get_total_weight() == 4);
REQUIRE(sketch.get_num_active_items() == 4);
REQUIRE(sketch.get_estimate(1) == 1);
REQUIRE(sketch.get_estimate(2) == 1);
REQUIRE(sketch.get_estimate(3) == 1);
REQUIRE(sketch.get_estimate(4) == 1);
}
TEST_CASE("frequent items: deserialize from java string", "[frequent_items_sketch]") {
std::ifstream is;
is.exceptions(std::ios::failbit | std::ios::badbit);
is.open(testBinaryInputPath + "items_sketch_string_from_java.sk", std::ios::binary);
auto sketch = frequent_items_sketch<std::string>::deserialize(is);
REQUIRE_FALSE(sketch.is_empty());
REQUIRE(sketch.get_total_weight() == 4);
REQUIRE(sketch.get_num_active_items() == 4);
REQUIRE(sketch.get_estimate("aaaaaaaaaaaaaaaaaaaaaaaaaaaaa") == 1);
REQUIRE(sketch.get_estimate("bbbbbbbbbbbbbbbbbbbbbbbbbbbbb") == 1);
REQUIRE(sketch.get_estimate("ccccccccccccccccccccccccccccc") == 1);
REQUIRE(sketch.get_estimate("ddddddddddddddddddddddddddddd") == 1);
}
TEST_CASE("frequent items: deserialize from java string, utf-8", "[frequent_items_sketch]") {
std::ifstream is;
is.exceptions(std::ios::failbit | std::ios::badbit);
is.open(testBinaryInputPath + "items_sketch_string_utf8_from_java.sk", std::ios::binary);
auto sketch = frequent_items_sketch<std::string>::deserialize(is);
REQUIRE_FALSE(sketch.is_empty());
REQUIRE(sketch.get_total_weight() == 10);
REQUIRE(sketch.get_num_active_items() == 4);
REQUIRE(sketch.get_estimate("абвгд") == 1);
REQUIRE(sketch.get_estimate("еёжзи") == 2);
REQUIRE(sketch.get_estimate("йклмн") == 3);
REQUIRE(sketch.get_estimate("опрст") == 4);
}
TEST_CASE("frequent items: deserialize long64 stream", "[frequent_items_sketch]") {
frequent_items_sketch<long long> sketch1(3);
sketch1.update(1, 1);
sketch1.update(2, 2);
sketch1.update(3, 3);
sketch1.update(4, 4);
sketch1.update(5, 5);
std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
sketch1.serialize(s);
auto sketch2 = frequent_items_sketch<long long>::deserialize(s);
REQUIRE_FALSE(sketch2.is_empty());
REQUIRE(sketch2.get_total_weight() == 15);
REQUIRE(sketch2.get_num_active_items() == 5);
REQUIRE(sketch2.get_estimate(1) == 1);
REQUIRE(sketch2.get_estimate(2) == 2);
REQUIRE(sketch2.get_estimate(3) == 3);
REQUIRE(sketch2.get_estimate(4) == 4);
REQUIRE(sketch2.get_estimate(5) == 5);
}
TEST_CASE("frequent items: serialize deserialiation long64 bytes", "[frequent_items_sketch]") {
frequent_items_sketch<long long> sketch1(3);
sketch1.update(1, 1);
sketch1.update(2, 2);
sketch1.update(3, 3);
sketch1.update(4, 4);
sketch1.update(5, 5);
auto bytes = sketch1.serialize();
auto sketch2 = frequent_items_sketch<long long>::deserialize(bytes.data(), bytes.size());
REQUIRE_FALSE(sketch2.is_empty());
REQUIRE(sketch2.get_total_weight() == 15);
REQUIRE(sketch2.get_num_active_items() == 5);
REQUIRE(sketch2.get_estimate(1) == 1);
REQUIRE(sketch2.get_estimate(2) == 2);
REQUIRE(sketch2.get_estimate(3) == 3);
REQUIRE(sketch2.get_estimate(4) == 4);
REQUIRE(sketch2.get_estimate(5) == 5);
}
TEST_CASE("frequent items: serialize deserialize string stream", "[frequent_items_sketch]") {
frequent_items_sketch<std::string> sketch1(3);
sketch1.update("aaaaaaaaaaaaaaaa", 1);
sketch1.update("bbbbbbbbbbbbbbbb", 2);
sketch1.update("cccccccccccccccc", 3);
sketch1.update("dddddddddddddddd", 4);
sketch1.update("eeeeeeeeeeeeeeee", 5);
std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
sketch1.serialize(s);
auto sketch2 = frequent_items_sketch<std::string>::deserialize(s);
REQUIRE_FALSE(sketch2.is_empty());
REQUIRE(sketch2.get_total_weight() == 15);
REQUIRE(sketch2.get_num_active_items() == 5);
REQUIRE(sketch2.get_estimate("aaaaaaaaaaaaaaaa") == 1);
REQUIRE(sketch2.get_estimate("bbbbbbbbbbbbbbbb") == 2);
REQUIRE(sketch2.get_estimate("cccccccccccccccc") == 3);
REQUIRE(sketch2.get_estimate("dddddddddddddddd") == 4);
REQUIRE(sketch2.get_estimate("eeeeeeeeeeeeeeee") == 5);
}
TEST_CASE("frequent items: serialize deserialize string bytes", "[frequent_items_sketch]") {
frequent_items_sketch<std::string> sketch1(3);
sketch1.update("aaaaaaaaaaaaaaaa", 1);
sketch1.update("bbbbbbbbbbbbbbbb", 2);
sketch1.update("cccccccccccccccc", 3);
sketch1.update("dddddddddddddddd", 4);
sketch1.update("eeeeeeeeeeeeeeee", 5);
auto bytes = sketch1.serialize();
auto sketch2 = frequent_items_sketch<std::string>::deserialize(bytes.data(), bytes.size());
REQUIRE_FALSE(sketch2.is_empty());
REQUIRE(sketch2.get_total_weight() == 15);
REQUIRE(sketch2.get_num_active_items() == 5);
REQUIRE(sketch2.get_estimate("aaaaaaaaaaaaaaaa") == 1);
REQUIRE(sketch2.get_estimate("bbbbbbbbbbbbbbbb") == 2);
REQUIRE(sketch2.get_estimate("cccccccccccccccc") == 3);
REQUIRE(sketch2.get_estimate("dddddddddddddddd") == 4);
REQUIRE(sketch2.get_estimate("eeeeeeeeeeeeeeee") == 5);
}
TEST_CASE("frequent items: serialize deserialize string, utf-8 stream", "[frequent_items_sketch]") {
frequent_items_sketch<std::string> sketch1(3);
sketch1.update("абвгд", 1);
sketch1.update("еёжзи", 2);
sketch1.update("йклмн", 3);
sketch1.update("опрст", 4);
sketch1.update("уфхцч", 5);
std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
sketch1.serialize(s);
auto sketch2 = frequent_items_sketch<std::string>::deserialize(s);
REQUIRE_FALSE(sketch2.is_empty());
REQUIRE(sketch2.get_total_weight() == 15);
REQUIRE(sketch2.get_num_active_items() == 5);
REQUIRE(sketch2.get_estimate("абвгд") == 1);
REQUIRE(sketch2.get_estimate("еёжзи") == 2);
REQUIRE(sketch2.get_estimate("йклмн") == 3);
REQUIRE(sketch2.get_estimate("опрст") == 4);
REQUIRE(sketch2.get_estimate("уфхцч") == 5);
}
TEST_CASE("frequent items: int64 deserialize single item buffer overrun", "[frequent_items_sketch]") {
frequent_items_sketch<int64_t> sketch(3);
sketch.update(1);
auto bytes = sketch.serialize();
REQUIRE_THROWS_AS(frequent_items_sketch<int64_t>::deserialize(bytes.data(), bytes.size() - 1), std::out_of_range);
}
TEST_CASE("frequent items: string deserialize single item buffer overrun", "[frequent_items_sketch]") {
frequent_items_sketch<std::string> sketch(3);
sketch.update("a");
auto bytes = sketch.serialize();
REQUIRE_THROWS_AS(frequent_items_sketch<std::string>::deserialize(bytes.data(), bytes.size() - 1), std::out_of_range);
}
} /* namespace datasketches */