| // Licensed to the Apache Software Foundation (ASF) under one |
| // or more contributor license agreements. See the NOTICE file |
| // distributed with this work for additional information |
| // regarding copyright ownership. The ASF licenses this file |
| // to you under the Apache License, Version 2.0 (the |
| // "License"); you may not use this file except in compliance |
| // with the License. You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, |
| // software distributed under the License is distributed on an |
| // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| // KIND, either express or implied. See the License for the |
| // specific language governing permissions and limitations |
| // under the License. |
| |
| #include <stdint.h> |
| |
| #include <iostream> |
| #include <limits> |
| #include <memory> |
| #include <vector> |
| |
| #include <immintrin.h> |
| |
| #include "util/benchmark.h" |
| #include "util/cpu-info.h" |
| #include "util/hash-util.h" |
| #include "util/sse-util.h" |
| |
| using namespace std; |
| using namespace impala; |
| |
| // Test hash functions that take integers as arguments and produce integers as the result. |
| // |
| // Machine Info: Intel(R) Core(TM) i7-4790 CPU @ 3.60GHz |
| // 32 -> 32: Function iters/ms 10%ile 50%ile 90%ile 10%ile 50%ile 90%ile |
| // (relative) (relative) (relative) |
| // --------------------------------------------------------------------------------------------------------- |
| // FNV 25.7 26 26.4 1X 1X 1X |
| // Zobrist 30.2 30.7 30.9 1.18X 1.18X 1.17X |
| // MultRot 42.6 43.4 43.4 1.66X 1.67X 1.64X |
| // MultiplyAddShift 42.4 43.2 43.2 1.65X 1.66X 1.64X |
| // Jenkins1 51.8 54 54 2.02X 2.08X 2.05X |
| // Jenkins2 66.2 67.4 67.5 2.58X 2.59X 2.56X |
| // CRC 98.6 100 101 3.84X 3.85X 3.84X |
| // MultiplyShift 150 152 153 5.84X 5.86X 5.79X |
| // |
| // 32 x 4 -> 32 x 4: Function iters/ms 10%ile 50%ile 90%ile 10%ile 50%ile 90%ile |
| // (relative) (relative) (relative) |
| // --------------------------------------------------------------------------------------------------------- |
| // (Multiple<Zobrist, 4>) 30.8 31.2 31.4 1X 1X 1X |
| // (Multiple<MultiplyAddShift, 4>) 44.2 45 45 1.43X 1.44X 1.43X |
| // (Multiple<CRC, 4>) 118 120 121 3.84X 3.86X 3.85X |
| // (Multiple<MultiplyShift, 4>) 156 159 159 5.07X 5.1X 5.08X |
| // MultiplyAddShift128 75.7 77.2 77.2 2.46X 2.48X 2.46X |
| // MultiplyShift128 128 131 133 4.16X 4.21X 4.23X |
| // |
| // 32 x 8 -> 32 x 8: Function iters/ms 10%ile 50%ile 90%ile 10%ile 50%ile 90%ile |
| // (relative) (relative) (relative) |
| // --------------------------------------------------------------------------------------------------------- |
| // (Multiple<Zobrist, 8>) 31 31.5 31.8 1X 1X 1X |
| // (Multiple<MultiplyAddShift, 8>) 44.3 44.5 45.2 1.43X 1.41X 1.42X |
| // (Multiple<CRC, 8>) 121 123 123 3.9X 3.9X 3.88X |
| // (Multiple<MultiplyShift, 8>) 158 159 160 5.11X 5.05X 5.04X |
| // Zobrist256simple 16.5 16.5 16.6 0.533X 0.524X 0.522X |
| // Zobrist256gather 18.8 19.2 19.4 0.608X 0.608X 0.61X |
| // MultiplyAddShift256 151 154 154 4.88X 4.88X 4.84X |
| // MultiplyShift256 209 212 212 6.73X 6.71X 6.67X |
| |
// Rotate 32 bits right by 'shift'. This is _rotr in the intel intrinsics, but that isn't
// usable on Clang yet. Fortunately, both GCC and Clang can optimize this to use the 'ror'
// instruction.
template<int SHIFT>
inline uint32_t RotateRight(uint32_t x) {
  constexpr int kBits = std::numeric_limits<uint32_t>::digits;
  static_assert(SHIFT > 0, "Only positive shifts are defined behavior and useful");
  static_assert(SHIFT < kBits, "This much shift is just 0");
  return (x >> SHIFT) | (x << (kBits - SHIFT));
}
| |
// Make a random uint32_t, avoiding the absent high bit and the low-entropy low bits
// produced by rand().
uint32_t MakeRandU32() {
  // Take 16 middle bits from each of two rand() calls and glue them together.
  const uint32_t hi = (rand() >> 8) & 0xffff;
  const uint32_t lo = (rand() >> 8) & 0xffff;
  return (hi << 16) | lo;
}
| |
// Almost universal hashing, M. Dietzfelbinger, T. Hagerup, J. Katajainen, and M.
// Penttonen, "A reliable randomized algorithm for the closest-pair problem".
inline void MultiplyShift(uint32_t* x) {
  // A fixed odd multiplicative constant; multiplication is mod 2^32.
  constexpr uint32_t kMultiplier = 0x61eaf8e9u;
  *x *= kMultiplier;
}
| |
// Like MultiplyShift, but using SSE's 128-bit SIMD registers to do 4 at once.
//
// TODO: Add the Intel intrinsics used in this function and the other functions in this
// file to sse-util.h so that these functions can be inlined.
inline void MultiplyShift128(__m128i* x) __attribute__((__target__("sse4.1")));
inline void MultiplyShift128(__m128i* x) {
  // Four independent 32-bit multiplications by the same odd constant.
  const __m128i kMultiplier = _mm_set1_epi32(0x61eaf8e9);
  const __m128i keys = _mm_loadu_si128(x);
  _mm_storeu_si128(x, _mm_mullo_epi32(keys, kMultiplier));
}
| |
// Like MultiplyShift, but using AVX2's 256-bit SIMD registers to do 8 at once.
//
// Not inline, because it degrades the performance for unknown reasons.
void MultiplyShift256(__m256i* x) __attribute__((__target__("avx2")));
void MultiplyShift256(__m256i* x) {
  // Eight independent 32-bit multiplications by the same odd constant.
  const __m256i kMultiplier = _mm256_set1_epi32(0x61eaf8e9);
  const __m256i keys = _mm256_loadu_si256(x);
  _mm256_storeu_si256(x, _mm256_mullo_epi32(keys, kMultiplier));
}
| |
// 2-independent hashing. M. Dietzfelbinger, "Universal hashing and k-wise independent
// random variables via integer arithmetic without primes"
inline void MultiplyAddShift(uint32_t* x) {
  // The hash is the high 32 bits of the 64-bit value m * x + a.
  constexpr uint64_t kMult = 0xa1f1bd3e020b4be0ull;
  constexpr uint64_t kAdd = 0x86b0426193d86e66ull;
  *x = static_cast<uint32_t>((kMult * *x + kAdd) >> 32);
}
| |
// Like MultiplyAddShift, but using SSE's 128-bit SIMD registers to do 4 at once.
inline void MultiplyAddShift128(__m128i* x) __attribute__((__target__("sse4.1")));
inline void MultiplyAddShift128(__m128i* x) {
  // The 64-bit multiplier is needed twice: in full for the widening 32x32->64
  // multiplies, and just its high 32 bits for the cheap 32x32->32 multiply.
  const __m128i kMult = _mm_set1_epi64x(0xa1f1bd3e020b4be0ull);
  const __m128i kMultHi = _mm_set1_epi32(0xa1f1bd3e);
  const __m128i kAdd = _mm_set1_epi64x(0x86b0426193d86e66ull);
  const __m128i keys = _mm_loadu_si128(x);
  // Low 32 bits of key * high-half-of-multiplier, for all four lanes at once.
  const __m128i easy_product = _mm_mullo_epi32(keys, kMultHi);
  // _mm_mul_epu32 only reads the even 32-bit lanes, so shift the odd lanes down
  // and run a second multiply to cover them.
  const __m128i odd_keys = _mm_srli_epi64(keys, 32);
  __m128i even_product = _mm_mul_epu32(keys, kMult);
  __m128i odd_product = _mm_mul_epu32(odd_keys, kMult);
  even_product = _mm_add_epi64(even_product, kAdd);
  odd_product = _mm_add_epi64(odd_product, kAdd);
  // Merge the two 64-bit product vectors back into one 32-bit-per-lane vector.
  const __m128i hard_product = _mm_unpackhi_epi32(even_product, odd_product);
  _mm_storeu_si128(x, _mm_add_epi32(easy_product, hard_product));
}
| |
// Like MultiplyAddShift, but using AVX2's 256-bit SIMD registers to do 8 at once.
inline void MultiplyAddShift256(__m256i* x) __attribute__((__target__("avx2")));
inline void MultiplyAddShift256(__m256i* x) {
  // Full 64-bit multiplier for the widening multiplies, its high 32 bits for the
  // cheap 32x32->32 multiply.
  const __m256i kMult = _mm256_set1_epi64x(0xa1f1bd3e020b4be0ull);
  const __m256i kMultHi = _mm256_set1_epi32(0xa1f1bd3e);
  const __m256i kAdd = _mm256_set1_epi64x(0x86b0426193d86e66ull);
  const __m256i keys = _mm256_loadu_si256(x);
  // Low 32 bits of key * high-half-of-multiplier, for all eight lanes at once.
  const __m256i easy_product = _mm256_mullo_epi32(keys, kMultHi);
  // _mm256_mul_epu32 only reads the even 32-bit lanes; shift the odd lanes down
  // and multiply again to cover them.
  const __m256i odd_keys = _mm256_srli_epi64(keys, 32);
  __m256i even_product = _mm256_mul_epu32(keys, kMult);
  __m256i odd_product = _mm256_mul_epu32(odd_keys, kMult);
  even_product = _mm256_add_epi64(even_product, kAdd);
  odd_product = _mm256_add_epi64(odd_product, kAdd);
  // Merge the two 64-bit product vectors back into one 32-bit-per-lane vector.
  const __m256i hard_product = _mm256_unpackhi_epi32(even_product, odd_product);
  _mm256_storeu_si256(x, _mm256_add_epi32(easy_product, hard_product));
}
| |
| // From http://web.archive.org/web/20071223173210/http://www.concentric.net/~Ttwang/tech/inthash.htm: |
| inline void Jenkins1(int32_t* x) { |
| *x = ~*x + (*x << 15); // x = (x << 15) - x - 1; |
| *x = *x ^ RotateRight<12>(*x); |
| *x = *x + (*x << 2); |
| *x = *x ^ RotateRight<4>(*x); |
| *x = *x * 2057; // x = (x + (x << 3)) + (x << 11); |
| *x = *x ^ RotateRight<16>(*x); |
| } |
| |
// From http://web.archive.org/web/20071223173210/http://www.concentric.net/~Ttwang/tech/inthash.htm:
inline void Jenkins2(uint32_t* a) {
  // Work in a local and write back once at the end.
  uint32_t h = *a;
  h = (h + 0x7ed55d16) + (h << 12);
  h = (h ^ 0xc761c23c) ^ (h >> 19);
  h = (h + 0x165667b1) + (h << 5);
  h = (h + 0xd3a2646c) ^ (h << 9);
  h = (h + 0xfd7046c5) + (h << 3);
  h = (h ^ 0xb55a4f09) ^ (h >> 16);
  *a = h;
}
| |
| // From http://web.archive.org/web/20071223173210/http://www.concentric.net/~Ttwang/tech/inthash.htm: |
| inline void MultRot(int32_t* key) { |
| static const int32_t c2 = 0x27d4eb2d; // a prime or an odd constant |
| *key = (*key ^ 61) ^ RotateRight<16>(*key); |
| *key = *key + (*key << 3); |
| *key = *key ^ RotateRight<4>(*key); |
| *key = *key * c2; |
| *key = *key ^ RotateRight<15>(*key); |
| } |
| |
// Hash via the SSE4.2 hardware crc32 instruction (SSE4_crc32_u32 is the wrapper from
// util/sse-util.h), combining the key with a fixed 32-bit constant. Callers check
// CpuInfo::IsSupported(CpuInfo::SSE4_2) before benchmarking this.
inline void CRC(uint32_t* x) {
  *x = SSE4_crc32_u32(*x, 0xab8ce2abu);
}
| |
// FNV hash of the key's 4 bytes, folded to 32 bits (HashUtil from util/hash-util.h).
// Note the key is hashed via its address, i.e. byte-by-byte, unlike the arithmetic
// hashes in this file.
inline void FNV(uint32_t* key) {
  *key = HashUtil::FnvHash64to32(key, sizeof(*key), HashUtil::FNV_SEED);
}
| |
// Zobrist hashing, also known as tabulation hashing or simple tabulation hashing, is an
// old technique that has been recently analyzed and found to be very good for a number of
// applications. See "The Power of Simple Tabulation Hashing", by Mihai Patrascu and
// Mikkel Thorup.

// One 256-entry table of random words per key byte; populated with random data before the
// benchmarks run.
uint32_t ZOBRIST_DATA[4][256];

inline void Zobrist(uint32_t* key) {
  // Look each of the four key bytes up in its own table and XOR the results.
  const uint8_t* bytes = reinterpret_cast<const uint8_t*>(key);
  uint32_t result = ZOBRIST_DATA[0][bytes[0]];
  result ^= ZOBRIST_DATA[1][bytes[1]];
  result ^= ZOBRIST_DATA[2][bytes[2]];
  result ^= ZOBRIST_DATA[3][bytes[3]];
  *key = result;
}
| |
// Like Zobrist, but uses AVX2's "gather" primitives to hash 8 values at once.
//
// NOTE(review): the gathers below pass a scale of 1, so the extracted byte values are
// used as *byte* offsets into each table, whereas indexing uint32_t entries (matching the
// scalar Zobrist) would need a scale of 4. The memory-access pattern and hence the timing
// should be similar either way, but the hash values differ from the scalar version --
// confirm this is intentional for a benchmark-only function.
inline void Zobrist256gather(__m256i* key) __attribute__((__target__("avx2")));
inline void Zobrist256gather(__m256i* key) {
  const auto k = _mm256_loadu_si256(key);
  const auto low_mask = _mm256_set1_epi32(0xff);
  // Extract bytes 0, 1, 2, and 3 of each of the eight 32-bit keys.
  auto k0 = _mm256_and_si256(low_mask, k),
      k1 = _mm256_and_si256(low_mask, _mm256_srli_epi32(k, 8)),
      k2 = _mm256_and_si256(low_mask, _mm256_srli_epi32(k, 16)),
      k3 = _mm256_and_si256(low_mask, _mm256_srli_epi32(k, 24));
  // One gather per byte position, each reading from its own table.
  k0 = _mm256_i32gather_epi32(reinterpret_cast<const int*>(ZOBRIST_DATA[0]), k0, 1);
  k1 = _mm256_i32gather_epi32(reinterpret_cast<const int*>(ZOBRIST_DATA[1]), k1, 1);
  k2 = _mm256_i32gather_epi32(reinterpret_cast<const int*>(ZOBRIST_DATA[2]), k2, 1);
  k3 = _mm256_i32gather_epi32(reinterpret_cast<const int*>(ZOBRIST_DATA[3]), k3, 1);
  // Combine the four lookups with XOR, as in the scalar version.
  auto k01 = _mm256_xor_si256(k0, k1), k23 = _mm256_xor_si256(k2, k3);
  _mm256_storeu_si256(key, _mm256_xor_si256(k01, k23));
}
| |
| // Like Zobrist256gather, but only uses AVX2's SIMD xor, not its gather. |
| inline void Zobrist256simple(uint32_t (*key)[8]) __attribute__((__target__("avx2"))); |
| inline void Zobrist256simple(uint32_t (*key)[8]) { |
| uint32_t row[4][8]; |
| const uint8_t (*key_chars)[8][4] = reinterpret_cast<const uint8_t (*)[8][4]>(key); |
| for (int i = 0; i < 4; ++i) { |
| for (int j = 0; j < 8; ++j) { |
| row[i][j] = ZOBRIST_DATA[i][(*key_chars)[j][i]]; |
| } |
| } |
| auto result0 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(row[0])), |
| result1 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(row[1])), |
| result2 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(row[2])), |
| result3 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(row[3])); |
| auto k01 = _mm256_xor_si256(result0, result1), k23 = _mm256_xor_si256(result2, result3); |
| _mm256_storeu_si256(reinterpret_cast<__m256i*>(*key), _mm256_xor_si256(k01, k23)); |
| } |
| |
// Perform one hash function the given number of times. This can sometimes auto-vectorize.
//
// TODO: We could also test the costs of running on non-contiguous uint32_t's. For
// instance, ExprValuesCache.expr_values_array_ might have multiple values to hash per
// input row.
template <void (*F)(uint32_t*), size_t N>
inline void Multiple(uint32_t (*x)[N]) {
  for (uint32_t& value : *x) {
    F(&value);
  }
}
| |
// The size of the test data we run each hash function on:
static const size_t DATA_SIZE = 1 << 15;

// Benchmark driver: hashes the whole buffer 'batch_size' times with HASH, treating the
// buffer as an array of T. The Benchmark framework supplies 'batch_size' and 'data'.
template<typename T, void (*HASH)(T*)>
void Run(int batch_size, void* data) {
  T* values = reinterpret_cast<T*>(data);
  // The buffer holds DATA_SIZE uint32_t's; express its length in units of T.
  const size_t num_values = (sizeof(uint32_t) * DATA_SIZE) / sizeof(T);
  for (int batch = 0; batch < batch_size; ++batch) {
    for (size_t i = 0; i < num_values; ++i) {
      HASH(&values[i]);
    }
  }
}
| |
| int main() { |
| CpuInfo::Init(); |
| cout << endl |
| << Benchmark::GetMachineInfo() << endl; |
| |
| unique_ptr<uint32_t[]> ud(new uint32_t[DATA_SIZE]); |
| for (size_t i = 0; i < (DATA_SIZE); ++i) { |
| ud[i] = MakeRandU32(); |
| } |
| |
| for (size_t i = 0; i < 4; ++i) { |
| for (size_t j = 0; j < 256; ++j) { |
| ZOBRIST_DATA[i][j] = MakeRandU32(); |
| } |
| } |
| |
| Benchmark suite32("32 -> 32"); |
| |
| #define BENCH(T,x) AddBenchmark(#x, Run<T, x>, ud.get()) |
| |
| suite32.BENCH(uint32_t, FNV); |
| suite32.BENCH(uint32_t, Zobrist); |
| suite32.BENCH(int32_t, MultRot); |
| suite32.BENCH(uint32_t, MultiplyAddShift); |
| suite32.BENCH(int32_t, Jenkins1); |
| suite32.BENCH(uint32_t, Jenkins2); |
| if (CpuInfo::IsSupported(CpuInfo::SSE4_2)) suite32.BENCH(uint32_t, CRC); |
| suite32.BENCH(uint32_t, MultiplyShift); |
| |
| cout << suite32.Measure() << endl; |
| |
| Benchmark suite32x4("32 x 4 -> 32 x 4"); |
| |
| suite32x4.BENCH(uint32_t[4], (Multiple<Zobrist, 4>)); |
| suite32x4.BENCH(uint32_t[4], (Multiple<MultiplyAddShift, 4>)); |
| if (CpuInfo::IsSupported(CpuInfo::SSE4_2)) { |
| suite32x4.BENCH(uint32_t[4], (Multiple<CRC, 4>)); |
| } |
| suite32x4.BENCH(uint32_t[4], (Multiple<MultiplyShift, 4>)); |
| if (CpuInfo::IsSupported(CpuInfo::SSE4_1)) { |
| suite32x4.BENCH(__m128i, MultiplyAddShift128); |
| suite32x4.BENCH(__m128i, MultiplyShift128); |
| } |
| |
| cout << suite32x4.Measure() << endl; |
| |
| Benchmark suite32x8("32 x 8 -> 32 x 8"); |
| |
| suite32x8.BENCH(uint32_t[8], (Multiple<Zobrist, 8>)); |
| suite32x8.BENCH(uint32_t[8], (Multiple<MultiplyAddShift, 8>)); |
| suite32x8.BENCH(uint32_t[8], (Multiple<CRC, 8>)); |
| suite32x8.BENCH(uint32_t[8], (Multiple<MultiplyShift, 8>)); |
| if (CpuInfo::IsSupported(CpuInfo::AVX2)) { |
| suite32x8.BENCH(uint32_t[8], Zobrist256simple); |
| suite32x8.BENCH(__m256i, Zobrist256gather); |
| suite32x8.BENCH(__m256i, MultiplyAddShift256); |
| suite32x8.BENCH(__m256i, MultiplyShift256); |
| } |
| |
| cout << suite32x8.Measure() << endl; |
| |
| #undef BENCH |
| } |