cpp/src/frequent_items_sketch_accuracy_profile.cpp - datasketches-characterization - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *   http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
  */

 #include <iostream>
 #include <unordered_map>
 #include <vector>

 #include <frequent_items_sketch.hpp>

 #include "frequent_items_sketch_accuracy_profile.hpp"
 #include "zipf_distribution.hpp"

 namespace datasketches {

 void frequent_items_sketch_accuracy_profile::run() {
   const unsigned lg_num_sketches = 4; // merge if > 0 (more than 1 sketch)

   const unsigned lg_min_stream_len = 5;
   const unsigned lg_max_stream_len = 23;
   const unsigned ppo = (lg_num_sketches > 0) ? 1 : 16; // simple partitioning for merging relies on powers of 2

   const unsigned lg_max_trials = 18;
   const unsigned lg_min_trials = 10;

   const unsigned lg_max_sketch_size = 10;

   const unsigned zipf_lg_range = 13; // range: 8K values for 1K sketch
   const double zipf_exponent = 0.7;

   zipf_distribution zipf(1 << zipf_lg_range, zipf_exponent);

   size_t stream_length = 1 << lg_min_stream_len;
   while (stream_length <= 1 << lg_max_stream_len) {
     unsigned num_items = 0;
     unsigned max_error = 0;
     unsigned num_error_1 = 0;
     unsigned num_error_2 = 0;
     unsigned extra_items = 0;
     unsigned num_error_3 = 0;

     const size_t num_trials = get_num_trials(stream_length, lg_min_stream_len, lg_max_stream_len, lg_min_trials, lg_max_trials);

     // trust sketch to compute epsilon
     unsigned threshold = frequent_items_sketch<unsigned>::get_epsilon(lg_max_sketch_size) * stream_length;

     unsigned* values = new unsigned[stream_length];

     for (size_t i = 0; i < num_trials; i++) {
       // prepare values for this trial
       for (size_t j = 0; j < stream_length; j++) {
         values[j] = zipf.sample();
       }

       frequent_items_sketch<unsigned> sketch(lg_max_sketch_size);
       if (lg_num_sketches == 0) {
         for (size_t j = 0; j < stream_length; j++) {
           sketch.update(values[j]);
         }
       } else {
         // this partitioning relies on stream length being power of 2, so ppo must be 1
         const unsigned substream_length = stream_length / (1 << lg_num_sketches);
         for (size_t j = 0; j < (1 << lg_num_sketches); j++) {
           frequent_items_sketch<unsigned> s(lg_max_sketch_size);
           for (size_t k = 0; k < substream_length; k++) {
             s.update(values[j * substream_length + k]);
           }
           sketch.merge(s);
         }
       }
       num_items += sketch.get_num_active_items();
       max_error += sketch.get_maximum_error();

       // brute-force frequent items
       std::unordered_map<unsigned, unsigned> frequencies;
       for (size_t j = 0; j < stream_length; j++) {
         frequencies[values[j]]++;
       }
       std::unordered_map<unsigned, unsigned> frequent_items;
       for (auto it: frequencies) {
         if (it.second > threshold) frequent_items[it.first] = it.second;
       }

       // checks

       // using a priori threshold (conservative)

       // this must be a subset of the exact solution (only frequent items above threshold, not necessarily all of them)
       auto conservative_no_false_positives = sketch.get_frequent_items(frequent_items_error_type::NO_FALSE_POSITIVES, threshold);
       for (auto& it: conservative_no_false_positives) {
         if (frequent_items.find(it.get_item()) == frequent_items.end()) {
           num_error_1++;
         }
       }

       // the exact solution must be a subset of this (all frequent items above threshold must be present, and possibly some extra items)
       auto conservative_no_false_negatives = sketch.get_frequent_items(frequent_items_error_type::NO_FALSE_NEGATIVES, threshold);
       std::unordered_map<unsigned, unsigned> conservative_no_false_negatives_map;
       for (auto& it: conservative_no_false_negatives) conservative_no_false_negatives_map[it.get_item()] = it.get_estimate();
       for (auto& it: frequent_items) {
         if (conservative_no_false_negatives_map.find(it.first) == conservative_no_false_negatives_map.end()) {
           num_error_2++;
         }
       }

       // using actual max error as threshold (the default)
       // this is expected to find more items compared to using conservative threshold

       auto no_false_positives = sketch.get_frequent_items(frequent_items_error_type::NO_FALSE_POSITIVES);
       for (auto& it: no_false_positives) {
         unsigned item = it.get_item();
         if (frequent_items.find(item) == frequent_items.end()) {
           extra_items++;
         }
       }

       // all items of the exact solution must be in this set
       auto no_false_negatives = sketch.get_frequent_items(frequent_items_error_type::NO_FALSE_NEGATIVES);
       std::unordered_map<unsigned, unsigned> no_false_negatives_map;
       for (auto& it: no_false_negatives) no_false_negatives_map[it.get_item()] = it.get_estimate();
       for (auto& it: frequent_items) {
         if (no_false_negatives_map.find(it.first) == no_false_negatives_map.end()) {
           num_error_3++;
         }
       }
     }
     delete [] values;

     std::cout << stream_length
         << "\t" << num_trials
         << "\t" << (double) num_items / num_trials
         << "\t" << threshold
         << "\t" << (double) max_error / num_trials
         << "\t" << (double) num_error_1 / num_trials
         << "\t" << (double) num_error_2 / num_trials
         << "\t" << (double) extra_items / num_trials
         << "\t" << (double) num_error_3 / num_trials
         << std::endl;

     stream_length = pwr_2_law_next(ppo, stream_length);
   }
 }

 } /* namespace datasketches */
	/*
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing,
	* software distributed under the License is distributed on an
	* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	* KIND, either express or implied. See the License for the
	* specific language governing permissions and limitations
	* under the License.
	*/

	#include <iostream>
	#include <unordered_map>

	#include <frequent_items_sketch.hpp>

	#include "frequent_items_sketch_accuracy_profile.hpp"
	#include "zipf_distribution.hpp"

	namespace datasketches {

	void frequent_items_sketch_accuracy_profile::run() {
	const unsigned lg_num_sketches = 4; // merge if > 0 (more than 1 sketch)

	const unsigned lg_min_stream_len = 5;
	const unsigned lg_max_stream_len = 23;
	const unsigned ppo = (lg_num_sketches > 0) ? 1 : 16; // simple partitioning for merging relies on powers of 2

	const unsigned lg_max_trials = 18;
	const unsigned lg_min_trials = 10;

	const unsigned lg_max_sketch_size = 10;

	const unsigned zipf_lg_range = 13; // range: 8K values for 1K sketch
	const double zipf_exponent = 0.7;

	zipf_distribution zipf(1 << zipf_lg_range, zipf_exponent);

	size_t stream_length = 1 << lg_min_stream_len;
	while (stream_length <= 1 << lg_max_stream_len) {
	unsigned num_items = 0;
	unsigned max_error = 0;
	unsigned num_error_1 = 0;
	unsigned num_error_2 = 0;
	unsigned extra_items = 0;
	unsigned num_error_3 = 0;

	const size_t num_trials = get_num_trials(stream_length, lg_min_stream_len, lg_max_stream_len, lg_min_trials, lg_max_trials);

	// trust sketch to compute epsilon
	unsigned threshold = frequent_items_sketch<unsigned>::get_epsilon(lg_max_sketch_size) * stream_length;

	unsigned* values = new unsigned[stream_length];

	for (size_t i = 0; i < num_trials; i++) {
	// prepare values for this trial
	for (size_t j = 0; j < stream_length; j++) {
	values[j] = zipf.sample();
	}

	frequent_items_sketch<unsigned> sketch(lg_max_sketch_size);
	if (lg_num_sketches == 0) {
	for (size_t j = 0; j < stream_length; j++) {
	sketch.update(values[j]);
	}
	} else {
	// this partitioning relies on stream length being power of 2, so ppo must be 1
	const unsigned substream_length = stream_length / (1 << lg_num_sketches);
	for (size_t j = 0; j < (1 << lg_num_sketches); j++) {
	frequent_items_sketch<unsigned> s(lg_max_sketch_size);
	for (size_t k = 0; k < substream_length; k++) {
	s.update(values[j * substream_length + k]);
	}
	sketch.merge(s);
	}
	}
	num_items += sketch.get_num_active_items();
	max_error += sketch.get_maximum_error();

	// brute-force frequent items
	std::unordered_map<unsigned, unsigned> frequencies;
	for (size_t j = 0; j < stream_length; j++) {
	frequencies[values[j]]++;
	}
	std::unordered_map<unsigned, unsigned> frequent_items;
	for (auto it: frequencies) {
	if (it.second > threshold) frequent_items[it.first] = it.second;
	}

	// checks

	// using a priori threshold (conservative)

	// this must be a subset of the exact solution (only frequent items above threshold, not necessarily all of them)
	auto conservative_no_false_positives = sketch.get_frequent_items(frequent_items_error_type::NO_FALSE_POSITIVES, threshold);
	for (auto& it: conservative_no_false_positives) {
	if (frequent_items.find(it.get_item()) == frequent_items.end()) {
	num_error_1++;
	}
	}

	// the exact solution must be a subset of this (all frequent items above threshold must be present, and possibly some extra items)
	auto conservative_no_false_negatives = sketch.get_frequent_items(frequent_items_error_type::NO_FALSE_NEGATIVES, threshold);
	std::unordered_map<unsigned, unsigned> conservative_no_false_negatives_map;
	for (auto& it: conservative_no_false_negatives) conservative_no_false_negatives_map[it.get_item()] = it.get_estimate();
	for (auto& it: frequent_items) {
	if (conservative_no_false_negatives_map.find(it.first) == conservative_no_false_negatives_map.end()) {
	num_error_2++;
	}
	}

	// using actual max error as threshold (the default)
	// this is expected to find more items compared to using conservative threshold

	auto no_false_positives = sketch.get_frequent_items(frequent_items_error_type::NO_FALSE_POSITIVES);
	for (auto& it: no_false_positives) {
	unsigned item = it.get_item();
	if (frequent_items.find(item) == frequent_items.end()) {
	extra_items++;
	}
	}

	// all items of the exact solution must be in this set
	auto no_false_negatives = sketch.get_frequent_items(frequent_items_error_type::NO_FALSE_NEGATIVES);
	std::unordered_map<unsigned, unsigned> no_false_negatives_map;
	for (auto& it: no_false_negatives) no_false_negatives_map[it.get_item()] = it.get_estimate();
	for (auto& it: frequent_items) {
	if (no_false_negatives_map.find(it.first) == no_false_negatives_map.end()) {
	num_error_3++;
	}
	}
	}
	delete [] values;

	std::cout << stream_length
	<< "\t" << num_trials
	<< "\t" << (double) num_items / num_trials
	<< "\t" << threshold
	<< "\t" << (double) max_error / num_trials
	<< "\t" << (double) num_error_1 / num_trials
	<< "\t" << (double) num_error_2 / num_trials
	<< "\t" << (double) extra_items / num_trials
	<< "\t" << (double) num_error_3 / num_trials
	<< std::endl;

	stream_length = pwr_2_law_next(ppo, stream_length);
	}
	}

	} /* namespace datasketches */