blob: 1664abd0447ce4ca412e6c61798e11742d426abb [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#ifndef KUDU_UTIL_HASH_UTIL_H
#define KUDU_UTIL_HASH_UTIL_H
#include <cstdint>
// Including glog/logging.h causes problems while compiling in Apache Impala for codegen.
// IWYU pragma: no_include <glog/logging.h>
#include "kudu/gutil/port.h"
#include "kudu/util/hash.pb.h"
#include "kudu/util/slice.h"
namespace kudu {
// Constant imported from Apache Impala used to compute hash values for special cases. It's an
// arbitrary constant obtained by taking lower bytes of generated UUID. Helps distinguish NULL
// values from empty objects.
// Impala uses the direct BlockBloomFilter API and inserts hash value directly using its own
// implementation of the Fast hash. Hence the value must match with Impala.
//
// Note: Since address of this static constexpr variable is used, declaring this as
// a member variable of HashUtil requires an explicit definition in .cc file
// and this class is completely defined in the header file to allow inlining.
static constexpr uint32_t kHashValNull = 0x58081667;
// Utility class to compute hash values.
class HashUtil {
public:
// Murmur2 hash implementation returning 64-bit hashes.
ATTRIBUTE_NO_SANITIZE_INTEGER
static uint64_t MurmurHash2_64(const void* input, int len, uint64_t seed) {
static constexpr uint64_t MURMUR_PRIME = 0xc6a4a7935bd1e995UL;
static constexpr int MURMUR_R = 47;
uint64_t h = seed ^ (len * MURMUR_PRIME);
const uint64_t* data = reinterpret_cast<const uint64_t*>(input);
const uint64_t* end = data + (len / sizeof(uint64_t));
while (data != end) {
uint64_t k = *data++;
k *= MURMUR_PRIME;
k ^= k >> MURMUR_R;
k *= MURMUR_PRIME;
h ^= k;
h *= MURMUR_PRIME;
}
const uint8_t* data2 = reinterpret_cast<const uint8_t*>(data);
switch (len & 7) {
case 7: h ^= static_cast<uint64_t>(data2[6]) << 48;
case 6: h ^= static_cast<uint64_t>(data2[5]) << 40;
case 5: h ^= static_cast<uint64_t>(data2[4]) << 32;
case 4: h ^= static_cast<uint64_t>(data2[3]) << 24;
case 3: h ^= static_cast<uint64_t>(data2[2]) << 16;
case 2: h ^= static_cast<uint64_t>(data2[1]) << 8;
case 1: h ^= static_cast<uint64_t>(data2[0]);
h *= MURMUR_PRIME;
}
h ^= h >> MURMUR_R;
h *= MURMUR_PRIME;
h ^= h >> MURMUR_R;
return h;
}
// FastHash is simple, robust, and efficient general-purpose hash function from Google.
// Implementation is adapted from https://code.google.com/archive/p/fast-hash/
//
// Adds special handling for nullptr input.
//
// Compute 64-bit FastHash.
ATTRIBUTE_NO_SANITIZE_INTEGER
static uint64_t FastHash64(const void* buf, size_t len, uint64_t seed) {
// Special handling for nullptr input with possible non-zero length as could be the
// case with nullable column values.
if (buf == nullptr) {
buf = &kHashValNull;
len = sizeof(kHashValNull);
}
static constexpr uint64_t kMultiplier = 0x880355f21e6d1965UL;
const uint64_t* pos = static_cast<const uint64_t*>(buf);
const uint64_t* end = pos + (len / 8);
uint64_t h = seed ^ (len * kMultiplier);
uint64_t v;
while (pos != end) {
v = *pos++;
h ^= FastHashMix(v);
h *= kMultiplier;
}
const uint8_t* pos2 = reinterpret_cast<const uint8_t*>(pos);
v = 0;
switch (len & 7) {
case 7: v ^= static_cast<uint64_t>(pos2[6]) << 48;
case 6: v ^= static_cast<uint64_t>(pos2[5]) << 40;
case 5: v ^= static_cast<uint64_t>(pos2[4]) << 32;
case 4: v ^= static_cast<uint64_t>(pos2[3]) << 24;
case 3: v ^= static_cast<uint64_t>(pos2[2]) << 16;
case 2: v ^= static_cast<uint64_t>(pos2[1]) << 8;
case 1: v ^= static_cast<uint64_t>(pos2[0]);
h ^= FastHashMix(v);
h *= kMultiplier;
}
return FastHashMix(h);
}
// Compute 32-bit FastHash.
static uint32_t FastHash32(const void* buf, size_t len, uint32_t seed) {
// the following trick converts the 64-bit hashcode to Fermat
// residue, which shall retain information from both the higher
// and lower parts of hashcode.
uint64_t h = FastHash64(buf, len, seed);
return h - (h >> 32);
}
// Checks whether 32-bit version of the hash algorithm is available.
// Must be kept in sync with ComputeHash32() function.
static bool IsComputeHash32Available(HashAlgorithm hash_algorithm) {
switch (hash_algorithm) {
case FAST_HASH:
return true;
default:
return false;
}
}
// Compute 32-bit hash of the supplied data using the specified hash algorithm.
// Must be kept in sync with IsComputeHash32Available() function.
static uint32_t ComputeHash32(const Slice& data, HashAlgorithm hash_algorithm, uint32_t seed) {
switch (hash_algorithm) {
case FAST_HASH:
return FastHash32(data.data(), data.size(), seed);
default:
// Can't use LOG(FATAL)/CHECK() since including glog/logging.h causes problems
// with code-gen in Impala.
abort();
}
}
private:
// Compression function for Merkle-Damgard construction.
ATTRIBUTE_NO_SANITIZE_INTEGER
static uint64_t FastHashMix(uint64_t h) {
h ^= h >> 23;
h *= 0x2127599bf4325c37UL;
h ^= h >> 47;
return h;
}
};
} // namespace kudu
#endif