| // Licensed to the Apache Software Foundation (ASF) under one |
| // or more contributor license agreements. See the NOTICE file |
| // distributed with this work for additional information |
| // regarding copyright ownership. The ASF licenses this file |
| // to you under the Apache License, Version 2.0 (the |
| // "License"); you may not use this file except in compliance |
| // with the License. You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, |
| // software distributed under the License is distributed on an |
| // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| // KIND, either express or implied. See the License for the |
| // specific language governing permissions and limitations |
| // under the License. |
| // This file is copied from |
| // https://github.com/ClickHouse/ClickHouse/blob/master/src/Common/HashTable/Hash.h |
| // and modified by Doris |
| |
| #pragma once |
| |
| #include <type_traits> |
| |
| #include "core/extended_types.h" |
| #include "core/string_ref.h" |
| #include "core/types.h" |
| #include "core/uint128.h" |
| #include "parallel_hashmap/phmap_utils.h" |
| |
| // Here is an empirical value. |
| static constexpr size_t HASH_MAP_PREFETCH_DIST = 16; |
| |
| /** Hash functions that are better than the trivial function std::hash. |
| * |
| * Example: when we do aggregation by the visitor ID, the performance increase is more than 5 times. |
| * This is because of following reasons: |
| * - in Yandex, visitor identifier is an integer that has timestamp with seconds resolution in lower bits; |
| * - in typical implementation of standard library, hash function for integers is trivial and just use lower bits; |
| * - traffic is non-uniformly distributed across a day; |
| * - we are using open-addressing linear probing hash tables that are most critical to hash function quality, |
| * and trivial hash function gives disastrous results. |
| */ |
| |
| /** Taken from MurmurHash. This is Murmur finalizer. |
| * Faster than int_hash32 when inserting into the hash table UInt64 -> UInt64, where the key is the visitor ID. |
| */ |
| inline doris::UInt64 int_hash64(doris::UInt64 x) { |
| x ^= x >> 33; |
| x *= 0xff51afd7ed558ccdULL; |
| x ^= x >> 33; |
| x *= 0xc4ceb9fe1a85ec53ULL; |
| x ^= x >> 33; |
| |
| return x; |
| } |
| |
| /** CRC32C is not very high-quality as a hash function, |
| * according to avalanche and bit independence tests (see SMHasher software), as well as a small number of bits, |
| * but can behave well when used in hash tables, |
| * due to high speed (latency 3 + 1 clock cycle, throughput 1 clock cycle). |
| * Works only with SSE 4.2 support. |
| */ |
| #include "util/sse_util.hpp" |
| |
| inline doris::UInt64 int_hash_crc32(doris::UInt64 x) { |
| #if defined(__SSE4_2__) || (defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)) |
| return _mm_crc32_u64(-1ULL, x); |
| #else |
| /// On other platforms we do not have CRC32. NOTE This can be confusing. |
| return int_hash64(x); |
| #endif |
| } |
| |
| template <typename T> |
| inline size_t default_hash64(T key) { |
| union { |
| T in; |
| doris::UInt64 out; |
| } u; |
| u.out = 0; |
| u.in = key; |
| return int_hash64(u.out); |
| } |
| |
| template <typename T, typename Enable = void> |
| struct DefaultHash; |
| |
| template <typename T> |
| requires std::is_arithmetic_v<T> |
| struct DefaultHash<T> { |
| size_t operator()(T key) const { return default_hash64<T>(key); } |
| }; |
| |
| template <> |
| struct DefaultHash<doris::VecDateTimeValue> { |
| size_t operator()(doris::VecDateTimeValue key) const { return int_hash64(*(int64_t*)&key); } |
| }; |
| |
| template <> |
| struct DefaultHash<doris::DateV2Value<doris::DateTimeV2ValueType>> { |
| size_t operator()(doris::DateV2Value<doris::DateTimeV2ValueType> key) const { |
| return int_hash64(key.to_date_int_val()); |
| } |
| }; |
| |
| template <> |
| struct DefaultHash<doris::DateV2Value<doris::DateV2ValueType>> { |
| size_t operator()(doris::DateV2Value<doris::DateV2ValueType> key) const { |
| return int_hash64(key.to_date_int_val()); |
| } |
| }; |
| |
| template <> |
| struct DefaultHash<doris::TimestampTzValue> { |
| size_t operator()(doris::TimestampTzValue key) const { |
| return int_hash64(key.to_date_int_val()); |
| } |
| }; |
| |
| template <> |
| struct DefaultHash<doris::StringRef> : public doris::StringRefHash {}; |
| |
| template <> |
| struct DefaultHash<wide::Int256> : public std::hash<wide::Int256> {}; |
| |
| template <typename T> |
| struct HashCRC32; |
| |
| template <typename T> |
| inline size_t hash_crc32(T key) { |
| union { |
| T in; |
| doris::UInt64 out; |
| } u; |
| u.out = 0; |
| u.in = key; |
| return int_hash_crc32(u.out); |
| } |
| |
| template <> |
| inline size_t hash_crc32(doris::UInt128 u) { |
| return doris::UInt128HashCRC32()(u); |
| } |
| |
| template <> |
| inline size_t hash_crc32(unsigned __int128 u) { |
| return doris::UInt128HashCRC32()(u); |
| } |
| |
| template <> |
| inline size_t hash_crc32(doris::Int128 u) { |
| return doris::UInt128HashCRC32()({(u >> 64) & int64_t(-1), u & int64_t(-1)}); |
| } |
| |
| template <> |
| inline size_t hash_crc32(doris::VecDateTimeValue u) { |
| return hash_crc32(*(int64_t*)&u); |
| } |
| |
| template <> |
| inline size_t hash_crc32(doris::DateV2Value<doris::DateTimeV2ValueType> u) { |
| return hash_crc32(u.to_date_int_val()); |
| } |
| |
| template <> |
| inline size_t hash_crc32(doris::DateV2Value<doris::DateV2ValueType> u) { |
| return hash_crc32(u.to_date_int_val()); |
| } |
| |
| template <> |
| inline size_t hash_crc32(doris::TimestampTzValue u) { |
| return hash_crc32(u.to_date_int_val()); |
| } |
| |
| #define DEFINE_HASH(T) \ |
| template <> \ |
| struct HashCRC32<T> { \ |
| size_t operator()(T key) const { \ |
| return hash_crc32<T>(key); \ |
| } \ |
| }; |
| |
| DEFINE_HASH(doris::UInt8) |
| DEFINE_HASH(doris::UInt16) |
| DEFINE_HASH(doris::UInt32) |
| DEFINE_HASH(doris::UInt64) |
| DEFINE_HASH(doris::UInt128) |
| DEFINE_HASH(doris::Int8) |
| DEFINE_HASH(doris::Int16) |
| DEFINE_HASH(doris::Int32) |
| DEFINE_HASH(doris::Int64) |
| DEFINE_HASH(doris::Int128) |
| DEFINE_HASH(doris::Float32) |
| DEFINE_HASH(doris::Float64) |
| DEFINE_HASH(doris::VecDateTimeValue) |
| DEFINE_HASH(doris::DateV2Value<doris::DateTimeV2ValueType>) |
| DEFINE_HASH(doris::DateV2Value<doris::DateV2ValueType>) |
| DEFINE_HASH(doris::TimestampTzValue) |
| DEFINE_HASH(unsigned __int128) |
| |
| #undef DEFINE_HASH |
| |
| template <typename Key, typename Hash = HashCRC32<Key>> |
| struct HashMixWrapper { |
| size_t operator()(Key key) const { return phmap::phmap_mix<sizeof(size_t)>()(Hash()(key)); } |
| }; |
| |
| template <> |
| struct HashCRC32<doris::UInt256> { |
| size_t operator()(const doris::UInt256& x) const { |
| #if defined(__SSE4_2__) || defined(__aarch64__) |
| doris::UInt64 crc = -1ULL; |
| crc = _mm_crc32_u64(crc, x.items[0]); |
| crc = _mm_crc32_u64(crc, x.items[1]); |
| crc = _mm_crc32_u64(crc, x.items[2]); |
| crc = _mm_crc32_u64(crc, x.items[3]); |
| return crc; |
| #else |
| return Hash128to64({Hash128to64({x.a, x.b}), Hash128to64({x.c, x.d})}); |
| #endif |
| } |
| }; |
| |
| template <> |
| struct HashCRC32<wide::Int256> { |
| size_t operator()(const wide::Int256& x) const { |
| #if defined(__SSE4_2__) || defined(__aarch64__) |
| doris::UInt64 crc = -1ULL; |
| crc = _mm_crc32_u64(crc, x.items[0]); |
| crc = _mm_crc32_u64(crc, x.items[1]); |
| crc = _mm_crc32_u64(crc, x.items[2]); |
| crc = _mm_crc32_u64(crc, x.items[3]); |
| return crc; |
| #else |
| return Hash128to64( |
| {Hash128to64({x.items[0], x.items[1]}), Hash128to64({x.items[2], x.items[3]})}); |
| #endif |
| } |
| }; |
| |
| template <> |
| struct HashCRC32<doris::Decimal256> { |
| size_t operator()(const doris::Decimal256& value) const { |
| return HashCRC32<wide::Int256>()(value.value); |
| } |
| }; |
| |
| template <> |
| struct HashCRC32<doris::Decimal32> { |
| size_t operator()(const doris::Decimal32& value) const { |
| return HashCRC32<int32_t>()(value.value); |
| } |
| }; |
| |
| template <> |
| struct HashCRC32<doris::Decimal64> { |
| size_t operator()(const doris::Decimal64& value) const { |
| return HashCRC32<int64_t>()(value.value); |
| } |
| }; |
| |
| template <> |
| struct HashCRC32<doris::Decimal128V3> { |
| size_t operator()(const doris::Decimal128V3& value) const { |
| return HashCRC32<__int128>()(value.value); |
| } |
| }; |
| |
| template <> |
| struct HashCRC32<doris::Decimal128V2> { |
| size_t operator()(const doris::Decimal128V2& value) const { |
| return HashCRC32<__int128>()(value.value); |
| } |
| }; |
| |
| template <> |
| struct HashCRC32<doris::DecimalV2Value> { |
| size_t operator()(const doris::DecimalV2Value& value) const { |
| return HashCRC32<__int128>()(value.value()); |
| } |
| }; |
| |
| #include "common/compile_check_avoid_begin.h" |
| |
| template <> |
| struct HashCRC32<doris::UInt72> { |
| size_t operator()(const doris::UInt72& x) const { |
| doris::UInt64 crc = -1ULL; |
| crc = _mm_crc32_u8(crc, x.a); |
| crc = _mm_crc32_u64(crc, x.b); |
| return crc; |
| } |
| }; |
| |
| template <> |
| struct HashCRC32<doris::UInt96> { |
| size_t operator()(const doris::UInt96& x) const { |
| doris::UInt64 crc = -1ULL; |
| crc = _mm_crc32_u32(crc, x.a); |
| crc = _mm_crc32_u64(crc, x.b); |
| return crc; |
| } |
| }; |
| |
| template <> |
| struct HashCRC32<doris::UInt104> { |
| size_t operator()(const doris::UInt104& x) const { |
| doris::UInt64 crc = -1ULL; |
| crc = _mm_crc32_u8(crc, x.a); |
| crc = _mm_crc32_u32(crc, x.b); |
| crc = _mm_crc32_u64(crc, x.c); |
| return crc; |
| } |
| }; |
| |
| template <> |
| struct HashCRC32<doris::UInt136> { |
| size_t operator()(const doris::UInt136& x) const { |
| doris::UInt64 crc = -1ULL; |
| crc = _mm_crc32_u8(crc, x.a); |
| crc = _mm_crc32_u64(crc, x.b); |
| crc = _mm_crc32_u64(crc, x.c); |
| return crc; |
| } |
| }; |
| |
| #include "common/compile_check_avoid_end.h" |