be/src/benchmarks/string-benchmark.cc - impala - Git at Google

 // Licensed to the Apache Software Foundation (ASF) under one
 // or more contributor license agreements.  See the NOTICE file
 // distributed with this work for additional information
 // regarding copyright ownership.  The ASF licenses this file
 // to you under the Apache License, Version 2.0 (the
 // "License"); you may not use this file except in compliance
 // with the License.  You may obtain a copy of the License at
 //
 //   http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing,
 // software distributed under the License is distributed on an
 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 // KIND, either express or implied.  See the License for the
 // specific language governing permissions and limitations
 // under the License.

 #include <algorithm>
 #include <stdlib.h>
 #include <stdio.h>
 #include <iostream>
 #include "runtime/string-value.h"
 #include "util/benchmark.h"
 #include "util/cpu-info.h"
 #include "util/hash-util.h"

 #include "common/names.h"

 using namespace impala;

 // Benchmark for testing internal representation of strings.  This is prototype
 // code and should eventually be merged into StringValue

 // In this case there are 10x as many short strings are long strings.
 // Results:
 // Machine Info: Intel(R) Core(TM) i7-2600 CPU @ 3.40GHz
 // String Test:          Function                Rate          Comparison
 // ----------------------------------------------------------------------
 //              Normal Sequential               31.54                  1X
 //             Compact Sequential               30.44             0.9651X
 //                  Normal Random               16.09               0.51X
 //                 Compact Random               23.13             0.7331X

 // This is a string representation for compact strings.  If the string is
 // sufficiently short (less than STORAGE_SIZE - 1 bytes), it will be stored
 // inline.  Otherwise, it will be stored as the len/ptr.  This only supports
 // strings up to 2^15 in length (only 2 bytes are used to store the len) and
 // relies on the fact that the upper 16 bits of addresses are unused on x64.
 template <int STORAGE_SIZE>
 struct CompactStringValue {
  private:
   union {
     char bytes_[STORAGE_SIZE];
     struct {
       // The upper bit is used to encode whether this is stored inline or not.
       // If the bit is set, it is inlined.
       long is_inline_       : 1;
       long inline_len_      : 7;
       char inline_data_[STORAGE_SIZE - 1];
     };
     struct {
       long dummy_           : 1;            // Lines up with is_inline_
       long indirect_len_    : 15;
       long indirect_ptr_    : 48;
       // Rest of STORAGE_SIZE is unused.  This is a minimum of 8 bytes.
       // TODO: we could try to adapt this to support longer lengths if STORAGE_SIZE is
       // greater than 8.
     };
   };

  public:
   CompactStringValue(const char* str) {
     long len = strlen(str);
     if (len < STORAGE_SIZE) {
       memcpy(inline_data_, str, len);
       inline_len_ = len;
       is_inline_ = true;
     } else {
       indirect_ptr_ = reinterpret_cast<long>(str);
       indirect_len_ = len;
       is_inline_ = false;
     }
   }

   const char* ptr() const {
     if (is_inline_) return inline_data_;
     return reinterpret_cast<const char*>(indirect_ptr_);
   }

   int len() const {
     if (is_inline_) return inline_len_;
     return indirect_len_;
   }
 };

 struct TestData {
   vector<StringValue> normal_strings;
   vector<CompactStringValue<8>> compact_strings;
   vector<int> random_order;
   uint32_t hash_normal;
   uint32_t hash_compact;
   vector<string> string_data;
 };

 void TestNormalStringsSequential(int batch_size, void* d) {
   TestData* data = reinterpret_cast<TestData*>(d);
   for (int i = 0; i < batch_size; ++i) {
     data->hash_normal = 0;
     for (int j = 0; j < data->normal_strings.size(); ++j) {
       const StringValue& str = data->normal_strings[j];
       data->hash_normal = HashUtil::CrcHash(str.ptr, str.len, data->hash_normal);
     }
   }
 }

 void TestCompactStringsSequential(int batch_size, void* d) {
   TestData* data = reinterpret_cast<TestData*>(d);
   for (int i = 0; i < batch_size; ++i) {
     data->hash_compact = 0;
     for (int j = 0; j < data->compact_strings.size(); ++j) {
       const CompactStringValue<8>& str = data->compact_strings[j];
       data->hash_compact = HashUtil::CrcHash(str.ptr(), str.len(), data->hash_compact);
     }
   }
 }

 void TestNormalStringsRandom(int batch_size, void* d) {
   TestData* data = reinterpret_cast<TestData*>(d);
   for (int i = 0; i < batch_size; ++i) {
     data->hash_normal = 0;
     for (int j = 0; j < data->normal_strings.size(); ++j) {
       const StringValue& str = data->normal_strings[data->random_order[j]];
       data->hash_normal = HashUtil::CrcHash(str.ptr, str.len, data->hash_normal);
     }
   }
 }

 void TestCompactStringsRandom(int batch_size, void* d) {
   TestData* data = reinterpret_cast<TestData*>(d);
   for (int i = 0; i < batch_size; ++i) {
     data->hash_compact = 0;
     for (int j = 0; j < data->compact_strings.size(); ++j) {
       const CompactStringValue<8>& str = data->compact_strings[data->random_order[j]];
       data->hash_compact = HashUtil::CrcHash(str.ptr(), str.len(), data->hash_compact);
     }
   }
 }

 void AddTestString(TestData* data, const char* s) {
   data->compact_strings.push_back(s);
   data->normal_strings.push_back(StringValue(const_cast<char*>(s), strlen(s)));
 }

 // This creates more short strings than long strings.
 void InitTestData(TestData* data, int num_small_strings, int num_large_strings) {
   for (int i = 0; i < num_small_strings; ++i) {
     data->string_data.push_back("small");
   }
   for (int i = 0; i < num_large_strings; ++i) {
     // We don't need this to be too large as to minimize the crc time.  It just
     // needs to be large enough to trigger then indirect string path.
     data->string_data.push_back("large-large-large");
   }

   for (int i = 0; i < num_small_strings; ++i) {
     AddTestString(data, data->string_data[i].c_str());
   }

   for (int i = 0; i < num_large_strings; ++i) {
     AddTestString(data, data->string_data[i + num_large_strings].c_str());
   }

   data->random_order.resize(data->normal_strings.size());
   for (int i = 0; i < data->random_order.size(); ++i) {
     data->random_order[i] = i;
   }
   random_shuffle(data->random_order.begin(), data->random_order.end());
 }

 int main(int argc, char **argv) {
   CpuInfo::Init();
   cout << Benchmark::GetMachineInfo() << endl;

   TestData data;
   InitTestData(&data, 10000, 100);

   Benchmark suite("String Test");
   suite.AddBenchmark("Normal Sequential", TestNormalStringsSequential, &data);
   suite.AddBenchmark("Compact Sequential", TestCompactStringsSequential, &data);
   suite.AddBenchmark("Normal Random", TestNormalStringsRandom, &data);
   suite.AddBenchmark("Compact Random", TestCompactStringsRandom, &data);
   cout << suite.Measure();

   if (data.hash_normal != data.hash_compact) {
     cout << "Uh oh - this is broken." << endl;
     return 1;
   }

   return 0;
 }
	// Licensed to the Apache Software Foundation (ASF) under one
	// or more contributor license agreements. See the NOTICE file
	// distributed with this work for additional information
	// regarding copyright ownership. The ASF licenses this file
	// to you under the Apache License, Version 2.0 (the
	// "License"); you may not use this file except in compliance
	// with the License. You may obtain a copy of the License at
	//
	// http://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing,
	// software distributed under the License is distributed on an
	// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	// KIND, either express or implied. See the License for the
	// specific language governing permissions and limitations
	// under the License.

	#include <algorithm>
	#include <stdlib.h>
	#include <stdio.h>
	#include <iostream>
	#include "runtime/string-value.h"
	#include "util/benchmark.h"
	#include "util/cpu-info.h"
	#include "util/hash-util.h"

	#include "common/names.h"

	using namespace impala;

	// Benchmark for testing internal representation of strings. This is prototype
	// code and should eventually be merged into StringValue

	// In this case there are 10x as many short strings are long strings.
	// Results:
	// Machine Info: Intel(R) Core(TM) i7-2600 CPU @ 3.40GHz
	// String Test: Function Rate Comparison
	// ----------------------------------------------------------------------
	// Normal Sequential 31.54 1X
	// Compact Sequential 30.44 0.9651X
	// Normal Random 16.09 0.51X
	// Compact Random 23.13 0.7331X

	// This is a string representation for compact strings. If the string is
	// sufficiently short (less than STORAGE_SIZE - 1 bytes), it will be stored
	// inline. Otherwise, it will be stored as the len/ptr. This only supports
	// strings up to 2^15 in length (only 2 bytes are used to store the len) and
	// relies on the fact that the upper 16 bits of addresses are unused on x64.
	template <int STORAGE_SIZE>
	struct CompactStringValue {
	private:
	union {
	char bytes_[STORAGE_SIZE];
	struct {
	// The upper bit is used to encode whether this is stored inline or not.
	// If the bit is set, it is inlined.
	long is_inline_ : 1;
	long inline_len_ : 7;
	char inline_data_[STORAGE_SIZE - 1];
	};
	struct {
	long dummy_ : 1; // Lines up with is_inline_
	long indirect_len_ : 15;
	long indirect_ptr_ : 48;
	// Rest of STORAGE_SIZE is unused. This is a minimum of 8 bytes.
	// TODO: we could try to adapt this to support longer lengths if STORAGE_SIZE is
	// greater than 8.
	};
	};

	public:
	CompactStringValue(const char* str) {
	long len = strlen(str);
	if (len < STORAGE_SIZE) {
	memcpy(inline_data_, str, len);
	inline_len_ = len;
	is_inline_ = true;
	} else {
	indirect_ptr_ = reinterpret_cast<long>(str);
	indirect_len_ = len;
	is_inline_ = false;
	}
	}

	const char* ptr() const {
	if (is_inline_) return inline_data_;
	return reinterpret_cast<const char*>(indirect_ptr_);
	}

	int len() const {
	if (is_inline_) return inline_len_;
	return indirect_len_;
	}
	};

	struct TestData {
	vector<StringValue> normal_strings;
	vector<CompactStringValue<8>> compact_strings;
	vector<int> random_order;
	uint32_t hash_normal;
	uint32_t hash_compact;
	vector<string> string_data;
	};

	void TestNormalStringsSequential(int batch_size, void* d) {
	TestData* data = reinterpret_cast<TestData*>(d);
	for (int i = 0; i < batch_size; ++i) {
	data->hash_normal = 0;
	for (int j = 0; j < data->normal_strings.size(); ++j) {
	const StringValue& str = data->normal_strings[j];
	data->hash_normal = HashUtil::CrcHash(str.ptr, str.len, data->hash_normal);
	}
	}
	}

	void TestCompactStringsSequential(int batch_size, void* d) {
	TestData* data = reinterpret_cast<TestData*>(d);
	for (int i = 0; i < batch_size; ++i) {
	data->hash_compact = 0;
	for (int j = 0; j < data->compact_strings.size(); ++j) {
	const CompactStringValue<8>& str = data->compact_strings[j];
	data->hash_compact = HashUtil::CrcHash(str.ptr(), str.len(), data->hash_compact);
	}
	}
	}

	void TestNormalStringsRandom(int batch_size, void* d) {
	TestData* data = reinterpret_cast<TestData*>(d);
	for (int i = 0; i < batch_size; ++i) {
	data->hash_normal = 0;
	for (int j = 0; j < data->normal_strings.size(); ++j) {
	const StringValue& str = data->normal_strings[data->random_order[j]];
	data->hash_normal = HashUtil::CrcHash(str.ptr, str.len, data->hash_normal);
	}
	}
	}

	void TestCompactStringsRandom(int batch_size, void* d) {
	TestData* data = reinterpret_cast<TestData*>(d);
	for (int i = 0; i < batch_size; ++i) {
	data->hash_compact = 0;
	for (int j = 0; j < data->compact_strings.size(); ++j) {
	const CompactStringValue<8>& str = data->compact_strings[data->random_order[j]];
	data->hash_compact = HashUtil::CrcHash(str.ptr(), str.len(), data->hash_compact);
	}
	}
	}

	void AddTestString(TestData* data, const char* s) {
	data->compact_strings.push_back(s);
	data->normal_strings.push_back(StringValue(const_cast<char*>(s), strlen(s)));
	}

	// This creates more short strings than long strings.
	void InitTestData(TestData* data, int num_small_strings, int num_large_strings) {
	for (int i = 0; i < num_small_strings; ++i) {
	data->string_data.push_back("small");
	}
	for (int i = 0; i < num_large_strings; ++i) {
	// We don't need this to be too large as to minimize the crc time. It just
	// needs to be large enough to trigger then indirect string path.
	data->string_data.push_back("large-large-large");
	}

	for (int i = 0; i < num_small_strings; ++i) {
	AddTestString(data, data->string_data[i].c_str());
	}

	for (int i = 0; i < num_large_strings; ++i) {
	AddTestString(data, data->string_data[i + num_large_strings].c_str());
	}

	data->random_order.resize(data->normal_strings.size());
	for (int i = 0; i < data->random_order.size(); ++i) {
	data->random_order[i] = i;
	}
	random_shuffle(data->random_order.begin(), data->random_order.end());
	}

	int main(int argc, char **argv) {
	CpuInfo::Init();
	cout << Benchmark::GetMachineInfo() << endl;

	TestData data;
	InitTestData(&data, 10000, 100);

	Benchmark suite("String Test");
	suite.AddBenchmark("Normal Sequential", TestNormalStringsSequential, &data);
	suite.AddBenchmark("Compact Sequential", TestCompactStringsSequential, &data);
	suite.AddBenchmark("Normal Random", TestNormalStringsRandom, &data);
	suite.AddBenchmark("Compact Random", TestCompactStringsRandom, &data);
	cout << suite.Measure();

	if (data.hash_normal != data.hash_compact) {
	cout << "Uh oh - this is broken." << endl;
	return 1;
	}

	return 0;
	}