blob: a5841a42f589e08ed43dac8c4e0f3f0a8960b7ca [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include <algorithm>
#include <stdlib.h>
#include <stdio.h>
#include <iostream>
#include "runtime/string-value.h"
#include "util/benchmark.h"
#include "util/cpu-info.h"
#include "util/hash-util.h"
#include "common/names.h"
using namespace impala;
// Benchmark for testing internal representation of strings. This is prototype
// code and should eventually be merged into StringValue
// In this case there are 10x as many short strings are long strings.
// Results:
// Machine Info: Intel(R) Core(TM) i7-2600 CPU @ 3.40GHz
// String Test: Function Rate Comparison
// ----------------------------------------------------------------------
// Normal Sequential 31.54 1X
// Compact Sequential 30.44 0.9651X
// Normal Random 16.09 0.51X
// Compact Random 23.13 0.7331X
// This is a string representation for compact strings. If the string is
// sufficiently short (less than STORAGE_SIZE - 1 bytes), it will be stored
// inline. Otherwise, it will be stored as the len/ptr. This only supports
// strings up to 2^15 in length (only 2 bytes are used to store the len) and
// relies on the fact that the upper 16 bits of addresses are unused on x64.
template <int STORAGE_SIZE>
struct CompactStringValue {
private:
union {
char bytes_[STORAGE_SIZE];
struct {
// The upper bit is used to encode whether this is stored inline or not.
// If the bit is set, it is inlined.
long is_inline_ : 1;
long inline_len_ : 7;
char inline_data_[STORAGE_SIZE - 1];
};
struct {
long dummy_ : 1; // Lines up with is_inline_
long indirect_len_ : 15;
long indirect_ptr_ : 48;
// Rest of STORAGE_SIZE is unused. This is a minimum of 8 bytes.
// TODO: we could try to adapt this to support longer lengths if STORAGE_SIZE is
// greater than 8.
};
};
public:
CompactStringValue(const char* str) {
long len = strlen(str);
if (len < STORAGE_SIZE) {
memcpy(inline_data_, str, len);
inline_len_ = len;
is_inline_ = true;
} else {
indirect_ptr_ = reinterpret_cast<long>(str);
indirect_len_ = len;
is_inline_ = false;
}
}
const char* ptr() const {
if (is_inline_) return inline_data_;
return reinterpret_cast<const char*>(indirect_ptr_);
}
int len() const {
if (is_inline_) return inline_len_;
return indirect_len_;
}
};
struct TestData {
vector<StringValue> normal_strings;
vector<CompactStringValue<8>> compact_strings;
vector<int> random_order;
uint32_t hash_normal;
uint32_t hash_compact;
vector<string> string_data;
};
void TestNormalStringsSequential(int batch_size, void* d) {
TestData* data = reinterpret_cast<TestData*>(d);
for (int i = 0; i < batch_size; ++i) {
data->hash_normal = 0;
for (int j = 0; j < data->normal_strings.size(); ++j) {
const StringValue& str = data->normal_strings[j];
data->hash_normal = HashUtil::CrcHash(str.ptr, str.len, data->hash_normal);
}
}
}
void TestCompactStringsSequential(int batch_size, void* d) {
TestData* data = reinterpret_cast<TestData*>(d);
for (int i = 0; i < batch_size; ++i) {
data->hash_compact = 0;
for (int j = 0; j < data->compact_strings.size(); ++j) {
const CompactStringValue<8>& str = data->compact_strings[j];
data->hash_compact = HashUtil::CrcHash(str.ptr(), str.len(), data->hash_compact);
}
}
}
void TestNormalStringsRandom(int batch_size, void* d) {
TestData* data = reinterpret_cast<TestData*>(d);
for (int i = 0; i < batch_size; ++i) {
data->hash_normal = 0;
for (int j = 0; j < data->normal_strings.size(); ++j) {
const StringValue& str = data->normal_strings[data->random_order[j]];
data->hash_normal = HashUtil::CrcHash(str.ptr, str.len, data->hash_normal);
}
}
}
void TestCompactStringsRandom(int batch_size, void* d) {
TestData* data = reinterpret_cast<TestData*>(d);
for (int i = 0; i < batch_size; ++i) {
data->hash_compact = 0;
for (int j = 0; j < data->compact_strings.size(); ++j) {
const CompactStringValue<8>& str = data->compact_strings[data->random_order[j]];
data->hash_compact = HashUtil::CrcHash(str.ptr(), str.len(), data->hash_compact);
}
}
}
void AddTestString(TestData* data, const char* s) {
data->compact_strings.push_back(s);
data->normal_strings.push_back(StringValue(const_cast<char*>(s), strlen(s)));
}
// This creates more short strings than long strings.
void InitTestData(TestData* data, int num_small_strings, int num_large_strings) {
for (int i = 0; i < num_small_strings; ++i) {
data->string_data.push_back("small");
}
for (int i = 0; i < num_large_strings; ++i) {
// We don't need this to be too large as to minimize the crc time. It just
// needs to be large enough to trigger then indirect string path.
data->string_data.push_back("large-large-large");
}
for (int i = 0; i < num_small_strings; ++i) {
AddTestString(data, data->string_data[i].c_str());
}
for (int i = 0; i < num_large_strings; ++i) {
AddTestString(data, data->string_data[i + num_large_strings].c_str());
}
data->random_order.resize(data->normal_strings.size());
for (int i = 0; i < data->random_order.size(); ++i) {
data->random_order[i] = i;
}
random_shuffle(data->random_order.begin(), data->random_order.end());
}
int main(int argc, char **argv) {
CpuInfo::Init();
cout << Benchmark::GetMachineInfo() << endl;
TestData data;
InitTestData(&data, 10000, 100);
Benchmark suite("String Test");
suite.AddBenchmark("Normal Sequential", TestNormalStringsSequential, &data);
suite.AddBenchmark("Compact Sequential", TestCompactStringsSequential, &data);
suite.AddBenchmark("Normal Random", TestNormalStringsRandom, &data);
suite.AddBenchmark("Compact Random", TestCompactStringsRandom, &data);
cout << suite.Measure();
if (data.hash_normal != data.hash_compact) {
cout << "Uh oh - this is broken." << endl;
return 1;
}
return 0;
}