| // Licensed to the Apache Software Foundation (ASF) under one |
| // or more contributor license agreements. See the NOTICE file |
| // distributed with this work for additional information |
| // regarding copyright ownership. The ASF licenses this file |
| // to you under the Apache License, Version 2.0 (the |
| // "License"); you may not use this file except in compliance |
| // with the License. You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, |
| // software distributed under the License is distributed on an |
| // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| // KIND, either express or implied. See the License for the |
| // specific language governing permissions and limitations |
| // under the License. |
| |
| #ifndef KUDU_UTIL_HASH_UTIL_H |
| #define KUDU_UTIL_HASH_UTIL_H |
| |
| #include <cstdint> |
| |
| // Including glog/logging.h causes problems while compiling in Apache Impala for codegen. |
| // IWYU pragma: no_include <glog/logging.h> |
| |
| #include "kudu/gutil/port.h" |
| #include "kudu/util/hash.pb.h" |
| #include "kudu/util/slice.h" |
| |
| namespace kudu { |
| |
| // Constant imported from Apache Impala used to compute hash values for special cases. It's an |
| // arbitrary constant obtained by taking lower bytes of generated UUID. Helps distinguish NULL |
| // values from empty objects. |
| // Impala uses the direct BlockBloomFilter API and inserts hash value directly using its own |
| // implementation of the Fast hash. Hence the value must match with Impala. |
| // |
| // Note: Since address of this static constexpr variable is used, declaring this as |
| // a member variable of HashUtil requires an explicit definition in .cc file |
| // and this class is completely defined in the header file to allow inlining. |
| static constexpr uint32_t kHashValNull = 0x58081667; |
| |
| // Utility class to compute hash values. |
| class HashUtil { |
| public: |
| |
| // Murmur2 hash implementation returning 64-bit hashes. |
| ATTRIBUTE_NO_SANITIZE_INTEGER |
| static uint64_t MurmurHash2_64(const void* input, int len, uint64_t seed) { |
| static constexpr uint64_t MURMUR_PRIME = 0xc6a4a7935bd1e995UL; |
| static constexpr int MURMUR_R = 47; |
| |
| uint64_t h = seed ^ (len * MURMUR_PRIME); |
| |
| const uint64_t* data = reinterpret_cast<const uint64_t*>(input); |
| const uint64_t* end = data + (len / sizeof(uint64_t)); |
| |
| while (data != end) { |
| uint64_t k = *data++; |
| k *= MURMUR_PRIME; |
| k ^= k >> MURMUR_R; |
| k *= MURMUR_PRIME; |
| h ^= k; |
| h *= MURMUR_PRIME; |
| } |
| |
| const uint8_t* data2 = reinterpret_cast<const uint8_t*>(data); |
| switch (len & 7) { |
| case 7: h ^= static_cast<uint64_t>(data2[6]) << 48; |
| case 6: h ^= static_cast<uint64_t>(data2[5]) << 40; |
| case 5: h ^= static_cast<uint64_t>(data2[4]) << 32; |
| case 4: h ^= static_cast<uint64_t>(data2[3]) << 24; |
| case 3: h ^= static_cast<uint64_t>(data2[2]) << 16; |
| case 2: h ^= static_cast<uint64_t>(data2[1]) << 8; |
| case 1: h ^= static_cast<uint64_t>(data2[0]); |
| h *= MURMUR_PRIME; |
| } |
| |
| h ^= h >> MURMUR_R; |
| h *= MURMUR_PRIME; |
| h ^= h >> MURMUR_R; |
| return h; |
| } |
| |
| |
| // FastHash is simple, robust, and efficient general-purpose hash function from Google. |
| // Implementation is adapted from https://code.google.com/archive/p/fast-hash/ |
| // |
| // Adds special handling for nullptr input. |
| // |
| // Compute 64-bit FastHash. |
| ATTRIBUTE_NO_SANITIZE_INTEGER |
| static uint64_t FastHash64(const void* buf, size_t len, uint64_t seed) { |
| // Special handling for nullptr input with possible non-zero length as could be the |
| // case with nullable column values. |
| if (buf == nullptr) { |
| buf = &kHashValNull; |
| len = sizeof(kHashValNull); |
| } |
| static constexpr uint64_t kMultiplier = 0x880355f21e6d1965UL; |
| const uint64_t* pos = static_cast<const uint64_t*>(buf); |
| const uint64_t* end = pos + (len / 8); |
| uint64_t h = seed ^ (len * kMultiplier); |
| uint64_t v; |
| |
| while (pos != end) { |
| v = *pos++; |
| h ^= FastHashMix(v); |
| h *= kMultiplier; |
| } |
| |
| const uint8_t* pos2 = reinterpret_cast<const uint8_t*>(pos); |
| v = 0; |
| |
| switch (len & 7) { |
| case 7: v ^= static_cast<uint64_t>(pos2[6]) << 48; |
| case 6: v ^= static_cast<uint64_t>(pos2[5]) << 40; |
| case 5: v ^= static_cast<uint64_t>(pos2[4]) << 32; |
| case 4: v ^= static_cast<uint64_t>(pos2[3]) << 24; |
| case 3: v ^= static_cast<uint64_t>(pos2[2]) << 16; |
| case 2: v ^= static_cast<uint64_t>(pos2[1]) << 8; |
| case 1: v ^= static_cast<uint64_t>(pos2[0]); |
| h ^= FastHashMix(v); |
| h *= kMultiplier; |
| } |
| |
| return FastHashMix(h); |
| } |
| |
| // Compute 32-bit FastHash. |
| static uint32_t FastHash32(const void* buf, size_t len, uint32_t seed) { |
| // the following trick converts the 64-bit hashcode to Fermat |
| // residue, which shall retain information from both the higher |
| // and lower parts of hashcode. |
| uint64_t h = FastHash64(buf, len, seed); |
| return h - (h >> 32); |
| } |
| |
| // Checks whether 32-bit version of the hash algorithm is available. |
| // Must be kept in sync with ComputeHash32() function. |
| static bool IsComputeHash32Available(HashAlgorithm hash_algorithm) { |
| switch (hash_algorithm) { |
| case FAST_HASH: |
| return true; |
| default: |
| return false; |
| } |
| } |
| |
| // Compute 32-bit hash of the supplied data using the specified hash algorithm. |
| // Must be kept in sync with IsComputeHash32Available() function. |
| static uint32_t ComputeHash32(const Slice& data, HashAlgorithm hash_algorithm, uint32_t seed) { |
| switch (hash_algorithm) { |
| case FAST_HASH: |
| return FastHash32(data.data(), data.size(), seed); |
| default: |
| // Can't use LOG(FATAL)/CHECK() since including glog/logging.h causes problems |
| // with code-gen in Impala. |
| abort(); |
| } |
| } |
| |
| private: |
| // Compression function for Merkle-Damgard construction. |
| ATTRIBUTE_NO_SANITIZE_INTEGER |
| static uint64_t FastHashMix(uint64_t h) { |
| h ^= h >> 23; |
| h *= 0x2127599bf4325c37UL; |
| h ^= h >> 47; |
| return h; |
| } |
| }; |
| |
| } // namespace kudu |
| #endif |