blob: 1c38c195b0407b39f9f9a7405dd30b881c080883 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
#include <iostream>
#include <algorithm>
#include <random>
#include <chrono>
#include <sstream>
#include <frequent_items_sketch.hpp>
#include "frequent_items_sketch_timing_profile.hpp"
#include "zipf_distribution.hpp"
namespace datasketches {
typedef frequent_items_sketch<long long> frequent_longs_sketch;
void frequent_items_sketch_timing_profile::run() {
const unsigned lg_min_stream_len = 0;
const unsigned lg_max_stream_len = 23;
const unsigned ppo = 16;
const unsigned lg_max_trials = 14;
const unsigned lg_min_trials = 8;
const unsigned lg_max_sketch_size = 10;
const unsigned zipf_lg_range = 13; // range: 8K values for 1K sketch
const double zipf_exponent = 0.7;
const double geom_p = 0.005;
std::default_random_engine generator(std::chrono::system_clock::now().time_since_epoch().count());
std::geometric_distribution<long long> geometric_distribution(geom_p);
zipf_distribution zipf(1 << zipf_lg_range, zipf_exponent);
std::cout << "StreamLen\tTrials\tBuild\tUpdate\tSerStream\tDeserStream\tSerBytes\tDeserBytes\tMaxErr\tNumItems\tSizeBytes" << std::endl;
size_t stream_length = 1 << lg_min_stream_len;
while (stream_length <= 1 << lg_max_stream_len) {
std::chrono::nanoseconds build_time_ns(0);
std::chrono::nanoseconds update_time_ns(0);
std::chrono::nanoseconds stream_serialize_time_ns(0);
std::chrono::nanoseconds stream_deserialize_time_ns(0);
std::chrono::nanoseconds bytes_serialize_time_ns(0);
std::chrono::nanoseconds bytes_deserialize_time_ns(0);
unsigned num_items = 0;
size_t size_bytes = 0;
size_t max_error = 0;
const size_t num_trials = get_num_trials(stream_length, lg_min_stream_len, lg_max_stream_len, lg_min_trials, lg_max_trials);
long long* values = new long long[stream_length];
for (size_t i = 0; i < num_trials; i++) {
const auto start_build(std::chrono::high_resolution_clock::now());
frequent_longs_sketch sketch(lg_max_sketch_size);
const auto finish_build(std::chrono::high_resolution_clock::now());
build_time_ns += std::chrono::duration_cast<std::chrono::nanoseconds>(finish_build - start_build);
// prepare values to exclude cost of random generator from the update loop
for (size_t j = 0; j < stream_length; j++) {
values[j] = zipf.sample();
}
const auto start_update(std::chrono::high_resolution_clock::now());
for (size_t j = 0; j < stream_length; ++j) {
sketch.update(values[j]);
}
const auto finish_update(std::chrono::high_resolution_clock::now());
update_time_ns += std::chrono::duration_cast<std::chrono::nanoseconds>(finish_update - start_update);
{
std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
auto start_stream_serialize(std::chrono::high_resolution_clock::now());
sketch.serialize(s);
const auto finish_stream_serialize(std::chrono::high_resolution_clock::now());
stream_serialize_time_ns += std::chrono::duration_cast<std::chrono::nanoseconds>(finish_stream_serialize - start_stream_serialize);
const auto start_stream_deserialize(std::chrono::high_resolution_clock::now());
auto deserialized_sketch = frequent_longs_sketch::deserialize(s);
const auto finish_stream_deserialize(std::chrono::high_resolution_clock::now());
stream_deserialize_time_ns += std::chrono::duration_cast<std::chrono::nanoseconds>(finish_stream_deserialize - start_stream_deserialize);
size_bytes += s.tellp();
}
{
auto start_bytes_serialize(std::chrono::high_resolution_clock::now());
auto bytes = sketch.serialize();
const auto finish_bytes_serialize(std::chrono::high_resolution_clock::now());
bytes_serialize_time_ns += std::chrono::duration_cast<std::chrono::nanoseconds>(finish_bytes_serialize - start_bytes_serialize);
const auto start_bytes_deserialize(std::chrono::high_resolution_clock::now());
auto deserialized_sketch = frequent_longs_sketch::deserialize(bytes.data(), bytes.size());
const auto finish_bytes_deserialize(std::chrono::high_resolution_clock::now());
bytes_deserialize_time_ns += std::chrono::duration_cast<std::chrono::nanoseconds>(finish_bytes_deserialize - start_bytes_deserialize);
}
num_items += sketch.get_num_active_items();
max_error += sketch.get_maximum_error();
}
delete [] values;
std::cout << stream_length << "\t"
<< num_trials << "\t"
<< (double) build_time_ns.count() / num_trials << "\t"
<< (double) update_time_ns.count() / num_trials / stream_length << "\t"
<< (double) stream_serialize_time_ns.count() / num_trials << "\t"
<< (double) stream_deserialize_time_ns.count() / num_trials << "\t"
<< (double) bytes_serialize_time_ns.count() / num_trials << "\t"
<< (double) bytes_deserialize_time_ns.count() / num_trials << "\t"
<< (double) max_error / num_trials << "\t"
<< (double) num_items / num_trials << "\t"
<< (double) size_bytes / num_trials << "\t"
<< std::endl;
stream_length = pwr_2_law_next(ppo, stream_length);
}
}
}