blob: 572af68deb59301a2bf9d7df4107990641891bd5 [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include <stdlib.h>
#include <stdio.h>
#include <iostream>
#include <vector>
#include "cache-hash-table.h"
#include "cache-hash-table.inline.h"
#include "standard-hash-table.h"
#include "standard-hash-table.inline.h"
#include "tuple-types.h"
#include "runtime/mem-pool.h"
#include "util/cpu-info.h"
#include "util/debug-util.h"
#include "util/pretty-printer.h"
#include "util/hash-util.h"
#include "util/runtime-profile.h"
#include "util/stopwatch.h"
using namespace impala;
// Very basic hash aggregation prototype and test
// TODO: Generalize beyond hash aggregation, beyond hashing on the one column, etc.
CacheHashTable::CacheHashTable() {
num_content_allocated_ = 0;
}
void CacheHashTable::BucketSizeDistribution() {
std::vector<int> bucket_size;
for (int i = 0; i < BUCKETS; ++i) {
int size = buckets_[i].count;
if (size >= bucket_size.size()) {
// grow bucket_size to fit this size
bucket_size.resize(size + 1, 0);
}
++bucket_size[size];
}
std::stringstream distr;
for (int i = 0; i < bucket_size.size(); ++i) {
distr << i << ": " << bucket_size[i] << "\n";
}
LOG(INFO) << "Bucket Size Distribution\n" << distr.str();
}
// Update ht, which is doing a COUNT(*) GROUP BY id,
// by having it process the new tuple probe.
// Templatized on the type of hash table so we can reuse code without virtual calls.
template<typename T>
inline void Process(T* ht, const ProbeTuple* probe) {
BuildTuple *existing = ht->Find(probe);
if (existing != NULL) {
++existing->count;
} else {
BuildTuple build;
build.id = probe->id;
build.count = 1;
ht->Insert(&build);
}
}
// Test ht by aggregating input, which is an array of num_tuples ProbeTuples
// Templatized on the type of hash table so we can reuse code without virtual calls.
template<typename T>
uint64_t Test(T* ht, const ProbeTuple* input, uint64_t num_tuples) {
StopWatch time;
time.Start();
for (int i = 0; i < num_tuples; ++i) {
Process<T>(ht, &input[i]);
}
time.Stop();
return time.ElapsedTime();
}
int main(int argc, char **argv) {
google::InitGoogleLogging(argv[0]);
CpuInfo::Init();
srand(time(NULL));
const int NUM_TUPLES = 100000000; //10^8
const int NUM_BUILD_TUPLES = 4 * CacheHashTable::MaxBuildTuples() / 10;
CacheHashTable cache_ht;
StandardHashTable std_ht;
ProbeTuple* input = GenTuples(NUM_TUPLES, NUM_BUILD_TUPLES);
uint64_t cache_time = Test<CacheHashTable>(&cache_ht, input, NUM_TUPLES);
LOG(ERROR) << "Cache-aware time: "
<< PrettyPrinter::Print(cache_time, TUnit::CPU_TICKS);
uint64_t std_time = Test<StandardHashTable>(&std_ht, input, NUM_TUPLES);
LOG(ERROR) << "Bucket-chained time: "
<< PrettyPrinter::Print(std_time, TUnit::CPU_TICKS);
return 0;
}