be/src/experiments/hashing/cache-hash-test.cc - impala - Git at Google

 // Licensed to the Apache Software Foundation (ASF) under one
 // or more contributor license agreements.  See the NOTICE file
 // distributed with this work for additional information
 // regarding copyright ownership.  The ASF licenses this file
 // to you under the Apache License, Version 2.0 (the
 // "License"); you may not use this file except in compliance
 // with the License.  You may obtain a copy of the License at
 //
 //   http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing,
 // software distributed under the License is distributed on an
 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 // KIND, either express or implied.  See the License for the
 // specific language governing permissions and limitations
 // under the License.

 #include <stdlib.h>
 #include <stdio.h>
 #include <iostream>

 #include <vector>

 #include "cache-hash-table.h"
 #include "cache-hash-table.inline.h"
 #include "standard-hash-table.h"
 #include "standard-hash-table.inline.h"
 #include "tuple-types.h"
 #include "runtime/mem-pool.h"
 #include "util/cpu-info.h"
 #include "util/debug-util.h"
 #include "util/pretty-printer.h"
 #include "util/hash-util.h"
 #include "util/runtime-profile.h"
 #include "util/stopwatch.h"

 using namespace impala;

 // Very basic hash aggregation prototype and test
 // TODO: Generalize beyond hash aggregation, beyond hashing on the one column, etc.

 CacheHashTable::CacheHashTable() {
   num_content_allocated_ = 0;
 }

 void CacheHashTable::BucketSizeDistribution() {
   std::vector<int> bucket_size;
   for (int i = 0; i < BUCKETS; ++i) {
     int size = buckets_[i].count;
     if (size >= bucket_size.size()) {
       // grow bucket_size to fit this size
       bucket_size.resize(size + 1, 0);
     }
     ++bucket_size[size];
   }

   std::stringstream distr;
   for (int i = 0; i < bucket_size.size(); ++i) {
     distr << i << ": " << bucket_size[i] << "\n";
   }
   LOG(INFO) << "Bucket Size Distribution\n" << distr.str();
 }


 // Update ht, which is doing a COUNT(*) GROUP BY id,
 // by having it process the new tuple probe.
 // Templatized on the type of hash table so we can reuse code without virtual calls.
 template<typename T>
 inline void Process(T* ht, const ProbeTuple* probe) {
   BuildTuple *existing = ht->Find(probe);
   if (existing != NULL) {
     ++existing->count;
   } else {
     BuildTuple build;
     build.id = probe->id;
     build.count = 1;
     ht->Insert(&build);
   }
 }

 // Test ht by aggregating input, which is an array of num_tuples ProbeTuples
 // Templatized on the type of hash table so we can reuse code without virtual calls.
 template<typename T>
 uint64_t Test(T* ht, const ProbeTuple* input, uint64_t num_tuples) {
   StopWatch time;
   time.Start();
   for (int i = 0; i < num_tuples; ++i) {
     Process<T>(ht, &input[i]);
   }
   time.Stop();
   return time.ElapsedTime();
 }

 int main(int argc, char **argv) {
   google::InitGoogleLogging(argv[0]);
   CpuInfo::Init();

   srand(time(NULL));

   const int NUM_TUPLES = 100000000; //10^8
   const int NUM_BUILD_TUPLES = 4 * CacheHashTable::MaxBuildTuples() / 10;

   CacheHashTable cache_ht;
   StandardHashTable std_ht;

   ProbeTuple* input = GenTuples(NUM_TUPLES, NUM_BUILD_TUPLES);
   uint64_t cache_time = Test<CacheHashTable>(&cache_ht, input, NUM_TUPLES);
   LOG(ERROR) << "Cache-aware time: "
              << PrettyPrinter::Print(cache_time, TUnit::CPU_TICKS);
   uint64_t std_time = Test<StandardHashTable>(&std_ht, input, NUM_TUPLES);

   LOG(ERROR) << "Bucket-chained time: "
              << PrettyPrinter::Print(std_time, TUnit::CPU_TICKS);
   return 0;
 }
	// Licensed to the Apache Software Foundation (ASF) under one
	// or more contributor license agreements. See the NOTICE file
	// distributed with this work for additional information
	// regarding copyright ownership. The ASF licenses this file
	// to you under the Apache License, Version 2.0 (the
	// "License"); you may not use this file except in compliance
	// with the License. You may obtain a copy of the License at
	//
	// http://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing,
	// software distributed under the License is distributed on an
	// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	// KIND, either express or implied. See the License for the
	// specific language governing permissions and limitations
	// under the License.

	#include <stdlib.h>
	#include <stdio.h>
	#include <iostream>

	#include <vector>

	#include "cache-hash-table.h"
	#include "cache-hash-table.inline.h"
	#include "standard-hash-table.h"
	#include "standard-hash-table.inline.h"
	#include "tuple-types.h"
	#include "runtime/mem-pool.h"
	#include "util/cpu-info.h"
	#include "util/debug-util.h"
	#include "util/pretty-printer.h"
	#include "util/hash-util.h"
	#include "util/runtime-profile.h"
	#include "util/stopwatch.h"

	using namespace impala;

	// Very basic hash aggregation prototype and test
	// TODO: Generalize beyond hash aggregation, beyond hashing on the one column, etc.

	CacheHashTable::CacheHashTable() {
	num_content_allocated_ = 0;
	}

	void CacheHashTable::BucketSizeDistribution() {
	std::vector<int> bucket_size;
	for (int i = 0; i < BUCKETS; ++i) {
	int size = buckets_[i].count;
	if (size >= bucket_size.size()) {
	// grow bucket_size to fit this size
	bucket_size.resize(size + 1, 0);
	}
	++bucket_size[size];
	}

	std::stringstream distr;
	for (int i = 0; i < bucket_size.size(); ++i) {
	distr << i << ": " << bucket_size[i] << "\n";
	}
	LOG(INFO) << "Bucket Size Distribution\n" << distr.str();
	}


	// Update ht, which is doing a COUNT(*) GROUP BY id,
	// by having it process the new tuple probe.
	// Templatized on the type of hash table so we can reuse code without virtual calls.
	template<typename T>
	inline void Process(T* ht, const ProbeTuple* probe) {
	BuildTuple *existing = ht->Find(probe);
	if (existing != NULL) {
	++existing->count;
	} else {
	BuildTuple build;
	build.id = probe->id;
	build.count = 1;
	ht->Insert(&build);
	}
	}

	// Test ht by aggregating input, which is an array of num_tuples ProbeTuples
	// Templatized on the type of hash table so we can reuse code without virtual calls.
	template<typename T>
	uint64_t Test(T* ht, const ProbeTuple* input, uint64_t num_tuples) {
	StopWatch time;
	time.Start();
	for (int i = 0; i < num_tuples; ++i) {
	Process<T>(ht, &input[i]);
	}
	time.Stop();
	return time.ElapsedTime();
	}

	int main(int argc, char **argv) {
	google::InitGoogleLogging(argv[0]);
	CpuInfo::Init();

	srand(time(NULL));

	const int NUM_TUPLES = 100000000; //10^8
	const int NUM_BUILD_TUPLES = 4 * CacheHashTable::MaxBuildTuples() / 10;

	CacheHashTable cache_ht;
	StandardHashTable std_ht;

	ProbeTuple* input = GenTuples(NUM_TUPLES, NUM_BUILD_TUPLES);
	uint64_t cache_time = Test<CacheHashTable>(&cache_ht, input, NUM_TUPLES);
	LOG(ERROR) << "Cache-aware time: "
	<< PrettyPrinter::Print(cache_time, TUnit::CPU_TICKS);
	uint64_t std_time = Test<StandardHashTable>(&std_ht, input, NUM_TUPLES);

	LOG(ERROR) << "Bucket-chained time: "
	<< PrettyPrinter::Print(std_time, TUnit::CPU_TICKS);
	return 0;
	}