blob: 05d5cfc8163fea343fe49d8a587836cccd1ff1ac [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
#include <iostream>
#include <iomanip>
#include <set>
#include <catch2/catch.hpp>
#include <tuple_sketch.hpp>
#include <tuple_union.hpp>
#include <stdexcept>
template<typename T>
class max_value_policy {
public:
max_value_policy(const T& initial_value): initial_value(initial_value) {}
T create() const { return initial_value; }
void update(T& summary, const T& update) const { summary = std::max(summary, update); }
private:
T initial_value;
};
using max_float_update_tuple_sketch = datasketches::update_tuple_sketch<float, float, max_value_policy<float>>;
template<typename T>
class always_one_policy {
public:
always_one_policy(): initial_value(1) {}
T create() const { return 1; }
void update(T&, const T&) const { }
private:
T initial_value;
};
using always_one_tuple_sketch = datasketches::update_tuple_sketch<int, int, always_one_policy<int>> ;
template<typename T>
class update_sum_value_policy {
public:
update_sum_value_policy(): initial_value(0) {}
T create() const { return initial_value; }
void update(T& summary, const T& update) const { summary += update; }
private:
T initial_value;
};
using sum_update_tuple_sketch = datasketches::update_tuple_sketch<int, int, update_sum_value_policy<int>>;
template<typename Summary>
struct union_sum_value_policy {
void operator()(Summary& summary, const Summary& other) const {
summary += other;
}
};
using sum_union_tuple_sketch = datasketches::tuple_union<int, union_sum_value_policy<int>> ;
class EngagementTest{
public:
int num_std_dev = 2 ;
void test_always_one_update(){
/*
* Tests that updates into an update_tuple_sketch sketch only keeps a 1 in the column for stored values.
*/
int lgK = 8 ;
std::vector<datasketches::update_tuple_sketch<int, int, always_one_policy<int>>> sketch_array ;
auto always_one_sketch = always_one_tuple_sketch::builder(always_one_policy<int>()).set_lg_k(lgK).build() ;
always_one_sketch.update(1, 1);
always_one_sketch.update(1, 2);
always_one_sketch.update(2, 1);
always_one_sketch.update(3, 3);
always_one_sketch.update(3, 7);
int num_retained = 0;
int sum = 0;
for (const auto& entry: always_one_sketch) {
sum += entry.second;
++num_retained;
}
REQUIRE(num_retained == 3);
REQUIRE(sum == 3); // we only keep 1 for every stored key.
}
void test_sum_update_policy(){
/*
* Tests that updates into an sum_update_tuple_sketch sum the stored values on updates.
*/
int lgK = 8 ;
auto sum_sketch = sum_update_tuple_sketch::builder().set_lg_k(lgK).build() ;
sum_sketch.update(1, 1);
sum_sketch.update(1, 2);
sum_sketch.update(2, 1);
sum_sketch.update(3, 3);
sum_sketch.update(3, 7);
int num_retained = 0;
int sum = 0;
for (const auto& entry: sum_sketch) {
sum += entry.second;
++num_retained;
}
REQUIRE(num_retained == 3);
REQUIRE(sum == 14); // (1+2) + 1 + (3 + 7) = 14
}
void test_sum_union_policy(){
/*
* Tests that updates into two sketches of sum_update_tuple_sketch flavour, which have been unioned,
* cause the stored values of two of the same keys to be summed.
*/
auto sketch1 = sum_update_tuple_sketch::builder().build() ;
auto sketch2 = sum_update_tuple_sketch::builder().build() ;
sketch1.update(1, 1);
sketch1.update(2, 1);
sketch1.update(3, 3);
sketch2.update(1, 2);
sketch2.update(2, 1);
sketch2.update(3, 7);
auto union_sketch = sum_union_tuple_sketch::builder().build() ;
union_sketch.update(sketch1) ;
union_sketch.update(sketch2) ;
auto union_result = union_sketch.get_result() ;
int num_retained = 0;
int sum = 0;
for (const auto& entry: union_result) {
sum += entry.second;
++num_retained;
}
REQUIRE(num_retained == 3);
REQUIRE(sum == 15); // 1:(1+2) + 2:(1+1) + 3:(3+7) = 15
}
void compute_engagement_histogram(){
/*
* Returns the estimated histogram from the synthetic data.
* On inspection one can verify this agrees with the
* https://github.com/apache/datasketches-java/blob/master/src/test/java/org/apache/datasketches/tuple/aninteger/EngagementTest.java
*/
int lgK = 8 ;
const int days = 30 ;
int v = 0 ;
std::set<int> set_array[days];
std::vector<datasketches::update_tuple_sketch<int, int, always_one_policy<int>>> sketch_array ;
for(int i=0; i<days ; i++){
auto builder = always_one_tuple_sketch::builder(always_one_policy<int>()) ;
builder.set_lg_k(lgK) ;
auto sketch = builder.build() ;
sketch_array.push_back(sketch);
}
REQUIRE(sketch_array.size() == days) ;
for(int i=0; i<=days; i++){
int32_t num_ids = get_num_ids(days, i) ;
int32_t num_days = get_num_days(days, i) ;
int my_v = v++ ;
for(int d=0 ; d<num_days; d++){
for(int id = 0; id < num_ids; id++){
set_array[d].insert(my_v + id) ;
sketch_array[d].update(my_v + id, 1) ;
}
}
v += num_ids ;
}
union_ops(lgK, sketch_array) ;
}
private:
int32_t get_num_ids(int total_days, int index){
/*
* Generates power law distributed synthetic data
*/
double d = total_days ;
double i = index ;
return int(round(exp(i * log(d) / d))) ;
}
int32_t get_num_days(int total_days, int index){
double d = total_days ;
double i = index ;
return int(round(exp( (d-i) * log(d) / d ))) ;
}
int32_t round_double_to_int(double x){
return int(std::round(x)) ;
}
void union_ops(int lgk, std::vector<datasketches::update_tuple_sketch<int, int, always_one_policy<int>>> sketches){
int num_sketches = sketches.size() ;
auto u = sum_union_tuple_sketch::builder().set_lg_k(lgk).build() ;
for(auto sk:sketches){
u.update(sk) ;
}
auto union_result = u.get_result() ;
std::vector<uint64_t> num_days_arr(num_sketches+1) ;
for (const auto& entry: union_result) {
int num_days_visited = entry.second ;
num_days_arr[num_days_visited]++;
}
int sum_visits = 0;
double theta = union_result.get_theta();
std::cout <<"\t\tEngagement Histogram.\t\t\t\n" ;
std::cout << "Number of Unique Visitors by Number of Days Visited" << std::endl ;
std::cout << "---------------------------------------------------" << std::endl ;
std::cout << std::setw(12) << "Days Visited"
<< std::setw(12) << "Estimate"
<< std::setw(12) << "LB"
<< std::setw(12) << "UB"
<< std:: endl ;
for (uint64_t i = 0; i < num_days_arr.size(); i++) {
int visitors_at_days_visited = num_days_arr[i] ;
if(visitors_at_days_visited == 0){ continue; }
sum_visits += visitors_at_days_visited * i ;
double est_visitors_at_days_visited = visitors_at_days_visited / theta ;
double lower_bound_at_days_visited = union_result.get_lower_bound(num_std_dev, visitors_at_days_visited);
double upper_bound_at_days_visited = union_result.get_upper_bound(num_std_dev, visitors_at_days_visited);
std::cout << std::setw(12) << i
<< std::setw(12) << est_visitors_at_days_visited
<< std::setw(12) << lower_bound_at_days_visited
<< std::setw(12) << upper_bound_at_days_visited
<< std:: endl ;
}
std::cout << std::endl << std::endl ;
std::cout << std::setw(12) << "Totals"
<< std::setw(12) << "Estimate"
<< std::setw(12) << "LB"
<< std::setw(12) << "UB"
<< std:: endl ;
std::cout << "---------------------------------------------------" << std::endl ;
const double total_visitors = union_result.get_estimate() ;
const double lb_visitors = union_result.get_lower_bound(num_std_dev) ;
const double ub_visitors = union_result.get_upper_bound(num_std_dev) ;
std::cout << std::setw(12) << "Visitors"
<< std::setw(12) << total_visitors
<< std::setw(12) << lb_visitors
<< std::setw(12) << ub_visitors
<< std:: endl ;
// The total number of visits, however, is a scaled metric and takes advantage of the fact that
// the retained entries in the sketch is a uniform random sample of all unique visitors, and
// the rest of the unique users will likely behave in the same way.
const double est_visits = sum_visits / theta;
const double lb_visits = est_visits * lb_visitors / total_visitors;
const double ub_visits = est_visits * ub_visitors / total_visitors;
std::cout << std::setw(12) << "Visits"
<< std::setw(12) << est_visits
<< std::setw(12) << lb_visits
<< std::setw(12) << ub_visits
<< std:: endl ;
}
};
namespace datasketches {
TEST_CASE("engagement", "[engagement]") {
EngagementTest E ;
E.test_always_one_update() ;
E.test_sum_update_policy() ;
E.test_sum_union_policy() ;
E.compute_engagement_histogram() ;
}
} /* namespace datasketches */