// blob: 28a523d723d5714ac1f989642fbc96a32d2f8d21 [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
// From Apache Impala as of 2016-02-22
#include <cstdint>
#include "parquet/util/compiler-util.h"
#include "parquet/util/cpu-info.h"
#include "parquet/util/logging.h"
#include "parquet/util/sse-util.h"
namespace parquet {
/// Utility class to compute hash values.
class HashUtil {
/// Compute the Crc32 hash for data using SSE4 instructions. The input hash
/// parameter is the current hash/seed value.
/// This should only be called if SSE is supported.
/// This is ~4x faster than Fnv/Boost Hash.
/// TODO: crc32 hashes with different seeds do not result in different hash functions.
/// The resulting hashes are correlated.
/// TODO: update this to also use SSE4_crc32_u64 and SSE4_crc32_u16 where appropriate.
static uint32_t CrcHash(const void* data, int32_t bytes, uint32_t hash) {
uint32_t words = bytes / sizeof(uint32_t);
bytes = bytes % sizeof(uint32_t);
const uint32_t* p = reinterpret_cast<const uint32_t*>(data);
while (words--) {
hash = SSE4_crc32_u32(hash, *p);
const uint8_t* s = reinterpret_cast<const uint8_t*>(p);
while (bytes--) {
hash = SSE4_crc32_u8(hash, *s);
// The lower half of the CRC hash has has poor uniformity, so swap the halves
// for anyone who only uses the first several bits of the hash.
hash = (hash << 16) | (hash >> 16);
return hash;
/// CrcHash() specialized for 1-byte data
static inline uint32_t CrcHash1(const void* v, uint32_t hash) {
const uint8_t* s = reinterpret_cast<const uint8_t*>(v);
hash = SSE4_crc32_u8(hash, *s);
hash = (hash << 16) | (hash >> 16);
return hash;
/// CrcHash() specialized for 2-byte data
static inline uint32_t CrcHash2(const void* v, uint32_t hash) {
const uint16_t* s = reinterpret_cast<const uint16_t*>(v);
hash = SSE4_crc32_u16(hash, *s);
hash = (hash << 16) | (hash >> 16);
return hash;
/// CrcHash() specialized for 4-byte data
static inline uint32_t CrcHash4(const void* v, uint32_t hash) {
const uint32_t* p = reinterpret_cast<const uint32_t*>(v);
hash = SSE4_crc32_u32(hash, *p);
hash = (hash << 16) | (hash >> 16);
return hash;
/// CrcHash() specialized for 8-byte data
static inline uint32_t CrcHash8(const void* v, uint32_t hash) {
const uint64_t* p = reinterpret_cast<const uint64_t*>(v);
hash = SSE4_crc32_u64(hash, *p);
hash = (hash << 16) | (hash >> 16);
return hash;
/// CrcHash() specialized for 12-byte data
static inline uint32_t CrcHash12(const void* v, uint32_t hash) {
const uint64_t* p = reinterpret_cast<const uint64_t*>(v);
hash = SSE4_crc32_u64(hash, *p);
hash = SSE4_crc32_u32(hash, *reinterpret_cast<const uint32_t*>(p));
hash = (hash << 16) | (hash >> 16);
return hash;
/// CrcHash() specialized for 16-byte data
static inline uint32_t CrcHash16(const void* v, uint32_t hash) {
const uint64_t* p = reinterpret_cast<const uint64_t*>(v);
hash = SSE4_crc32_u64(hash, *p);
hash = SSE4_crc32_u64(hash, *p);
hash = (hash << 16) | (hash >> 16);
return hash;
static const uint64_t MURMUR_PRIME = 0xc6a4a7935bd1e995;
static const int MURMUR_R = 47;
/// Murmur2 hash implementation returning 64-bit hashes.
static uint64_t MurmurHash2_64(const void* input, int len, uint64_t seed) {
uint64_t h = seed ^ (len * MURMUR_PRIME);
const uint64_t* data = reinterpret_cast<const uint64_t*>(input);
const uint64_t* end = data + (len / sizeof(uint64_t));
while (data != end) {
uint64_t k = *data++;
k ^= k >> MURMUR_R;
h ^= k;
const uint8_t* data2 = reinterpret_cast<const uint8_t*>(data);
switch (len & 7) {
case 7:
h ^= uint64_t(data2[6]) << 48;
case 6:
h ^= uint64_t(data2[5]) << 40;
case 5:
h ^= uint64_t(data2[4]) << 32;
case 4:
h ^= uint64_t(data2[3]) << 24;
case 3:
h ^= uint64_t(data2[2]) << 16;
case 2:
h ^= uint64_t(data2[1]) << 8;
case 1:
h ^= uint64_t(data2[0]);
h ^= h >> MURMUR_R;
h ^= h >> MURMUR_R;
return h;
/// default values recommended by
static const uint32_t FNV_PRIME = 0x01000193; // 16777619
static const uint32_t FNV_SEED = 0x811C9DC5; // 2166136261
static const uint64_t FNV64_PRIME = 1099511628211UL;
static const uint64_t FNV64_SEED = 14695981039346656037UL;
/// Implementation of the Fowler-Noll-Vo hash function. This is not as performant
/// as boost's hash on int types (2x slower) but has bit entropy.
/// For ints, boost just returns the value of the int which can be pathological.
/// For example, if the data is <1000, 2000, 3000, 4000, ..> and then the mod of 1000
/// is taken on the hash, all values will collide to the same bucket.
/// For string values, Fnv is slightly faster than boost.
/// IMPORTANT: FNV hash suffers from poor diffusion of the least significant bit,
/// which can lead to poor results when input bytes are duplicated.
/// See FnvHash64to32() for how this can be mitigated.
static uint64_t FnvHash64(const void* data, int32_t bytes, uint64_t hash) {
const uint8_t* ptr = reinterpret_cast<const uint8_t*>(data);
while (bytes--) {
hash = (*ptr ^ hash) * FNV64_PRIME;
return hash;
/// Return a 32-bit hash computed by invoking FNV-64 and folding the result to 32-bits.
/// This technique is recommended instead of FNV-32 since the LSB of an FNV hash is the
/// XOR of the LSBs of its input bytes, leading to poor results for duplicate inputs.
/// The input seed 'hash' is duplicated so the top half of the seed is not all zero.
/// Data length must be at least 1 byte: zero-length data should be handled separately,
/// for example using CombineHash with a unique constant value to avoid returning the
/// hash argument. Zero-length data gives terrible results: the initial hash value is
/// xored with itself cancelling all bits.
static uint32_t FnvHash64to32(const void* data, int32_t bytes, uint32_t hash) {
// IMPALA-2270: this function should never be used for zero-byte inputs.
DCHECK_GT(bytes, 0);
uint64_t hash_u64 = hash | ((uint64_t)hash << 32);
hash_u64 = FnvHash64(data, bytes, hash_u64);
return (hash_u64 >> 32) ^ (hash_u64 & 0xFFFFFFFF);
/// Computes the hash value for data. Will call either CrcHash or MurmurHash
/// depending on hardware capabilities.
/// Seed values for different steps of the query execution should use different seeds
/// to prevent accidental key collisions. (See IMPALA-219 for more details).
static uint32_t Hash(const void* data, int32_t bytes, uint32_t seed) {
if (LIKELY(CpuInfo::IsSupported(CpuInfo::SSE4_2))) {
return CrcHash(data, bytes, seed);
} else {
return MurmurHash2_64(data, bytes, seed);
return static_cast<uint32_t>(MurmurHash2_64(data, bytes, seed));
/// The magic number (used in hash_combine()) 0x9e3779b9 = 2^32 / (golden ratio).
static const uint32_t HASH_COMBINE_SEED = 0x9e3779b9;
/// Combine hashes 'value' and 'seed' to get a new hash value. Similar to
/// boost::hash_combine(), but for uint32_t. This function should be used with a
/// constant first argument to update the hash value for zero-length values such as
/// NULL, boolean, and empty strings.
static inline uint32_t HashCombine32(uint32_t value, uint32_t seed) {
return seed ^ (HASH_COMBINE_SEED + value + (seed << 6) + (seed >> 2));
// Get 32 more bits of randomness from a 32-bit hash:
static inline uint32_t Rehash32to32(const uint32_t hash) {
// Constants generated by uuidgen(1) with the -r flag
static const uint64_t m = 0x7850f11ec6d14889ull, a = 0x6773610597ca4c63ull;
// This is strongly universal hashing following Dietzfelbinger's "Universal hashing
// and k-wise independent random variables via integer arithmetic without primes". As
// such, for any two distinct uint32_t's hash1 and hash2, the probability (over the
// randomness of the constants) that any subset of bit positions of
// Rehash32to32(hash1) is equal to the same subset of bit positions
// Rehash32to32(hash2) is minimal.
return (static_cast<uint64_t>(hash) * m + a) >> 32;
static inline uint64_t Rehash32to64(const uint32_t hash) {
static const uint64_t m1 = 0x47b6137a44974d91ull, m2 = 0x8824ad5ba2b7289cull,
a1 = 0x705495c62df1424aull, a2 = 0x9efc49475c6bfb31ull;
const uint64_t hash1 = (static_cast<uint64_t>(hash) * m1 + a1) >> 32;
const uint64_t hash2 = (static_cast<uint64_t>(hash) * m2 + a2) >> 32;
return hash1 | (hash2 << 32);
} // namespace parquet