/*
* Copyright 2024-present Alibaba Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* xxHash - Extremely Fast Hash algorithm
* Header File
* Copyright (C) 2012-2023 Yann Collet
*
* BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php)
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following disclaimer
* in the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* You can contact the author at:
* - xxHash homepage: https://www.xxhash.com
* - xxHash source repository: https://github.com/Cyan4973/xxHash
*/
// MMH_rotl32 utility is adapted from xxHash
// https://github.com/Cyan4973/xxHash/blob/dev/xxhash.h
#pragma once
#include <cassert>
#include <cstdint>
#include <cstring>
#include <memory>
#include "paimon/common/memory/memory_segment.h"
#include "paimon/memory/bytes.h"
namespace paimon {
#ifdef __has_builtin
#define MMH_HAS_BUILTIN(x) __has_builtin(x)
#else
#define MMH_HAS_BUILTIN(x) 0
#endif
/*!
* @internal
* @def MMH_rotl32(x,r)
* @brief 32-bit rotate left.
*
* @param x The 32-bit integer to be rotated.
* @param r The number of bits to rotate.
* @pre
* @p r > 0 && @p r < 32
* @note
* @p x and @p r may be evaluated multiple times.
* @return The rotated result.
*/
#if !defined(NO_CLANG_BUILTIN) && MMH_HAS_BUILTIN(__builtin_rotateleft32) && \
MMH_HAS_BUILTIN(__builtin_rotateleft64)
#define MMH_rotl32 __builtin_rotateleft32
/* Note: although _rotl exists for MinGW (GCC under Windows), performance seems poor */
#elif defined(_MSC_VER)
#define MMH_rotl32(x, r) _rotl(x, r)
#else
#define MMH_rotl32(x, r) (((x) << (r)) | ((x) >> (32 - (r))))
#endif
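/// 32-bit MurmurHash3-style hash utilities for hashing raw memory, Bytes buffers and
/// MemorySegment contents. All members are static; the class cannot be instantiated.
///
/// Minimal usage sketch (the buffer below is hypothetical, for illustration only):
/// @code
/// const char data[] = "paimon"; // 6 bytes of payload
/// // Hash the first 4 bytes as whole 4-byte words.
/// int32_t word_hash = MurmurHashUtils::HashUnsafeBytesByWords(data, 0, 4);
/// // Hash all 6 bytes; the 2 trailing bytes are mixed in one at a time.
/// int32_t full_hash = MurmurHashUtils::HashUnsafeBytes(data, 0, 6);
/// @endcode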
class MurmurHashUtils {
public:
static constexpr int32_t DEFAULT_SEED = 42;
MurmurHashUtils() = delete;
/// Hash unsafe bytes; the length must be a multiple of 4 bytes.
///
/// @param base base address of the bytes to hash
/// @param offset byte offset from base
/// @param length_in_bytes length in bytes
/// @return hash code
static int32_t HashUnsafeBytesByWords(const void* base, int64_t offset,
int32_t length_in_bytes) {
return HashUnsafeBytesByWords(base, offset, length_in_bytes, DEFAULT_SEED);
}
/// Hash bytes and clear the sign bit so the result is always non-negative.
static int32_t HashBytesPositive(const std::shared_ptr<Bytes>& bytes) {
return HashBytes(bytes) & 0x7fffffff;
}
/// Hash bytes with the default seed.
static int32_t HashBytes(const std::shared_ptr<Bytes>& bytes) {
return HashUnsafeBytes(reinterpret_cast<const void*>(bytes->data()), 0, bytes->size(),
DEFAULT_SEED);
}
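/// Hash unsafe bytes of any length with an explicit seed: whole 4-byte words are mixed
/// first, then each remaining tail byte is mixed individually.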
static int32_t HashUnsafeBytes(const void* base, int64_t offset, int32_t length_in_bytes,
int32_t seed) {
assert(length_in_bytes >= 0);
int32_t length_aligned = length_in_bytes - length_in_bytes % 4;
int32_t h1 = HashUnsafeBytesByInt(base, offset, length_aligned, seed);
for (int32_t i = length_aligned; i < length_in_bytes; i++) {
int32_t half_word = GetByte(base, offset + i);
int32_t k1 = MixK1(half_word);
h1 = MixH1(h1, k1);
}
return Fmix(h1, length_in_bytes);
}
/// Hash unsafe bytes of any length.
///
/// @param base base address of the bytes to hash
/// @param offset byte offset from base
/// @param length_in_bytes length in bytes
/// @return hash code
static int32_t HashUnsafeBytes(const void* base, int64_t offset, int32_t length_in_bytes) {
return HashUnsafeBytes(base, offset, length_in_bytes, DEFAULT_SEED);
}
/// Hash bytes in a MemorySegment; the length must be a multiple of 4 bytes.
///
/// @param segment memory segment to read from
/// @param offset offset within the MemorySegment
/// @param length_in_bytes length in bytes
/// @return hash code
static int32_t HashBytesByWords(const MemorySegment& segment, int32_t offset,
int32_t length_in_bytes) {
return HashBytesByWords(segment, offset, length_in_bytes, DEFAULT_SEED);
}
/// Hash bytes in a MemorySegment.
///
/// @param segment memory segment to read from
/// @param offset offset within the MemorySegment
/// @param length_in_bytes length in bytes
/// @return hash code
static int32_t HashBytes(const MemorySegment& segment, int32_t offset,
int32_t length_in_bytes) {
return HashBytes(segment, offset, length_in_bytes, DEFAULT_SEED);
}
private:
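/// Word-aligned variant with an explicit seed; length_in_bytes must be a multiple of 4.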
static int32_t HashUnsafeBytesByWords(const void* base, int64_t offset, int32_t length_in_bytes,
int32_t seed) {
int32_t h1 = HashUnsafeBytesByInt(base, offset, length_in_bytes, seed);
return Fmix(h1, length_in_bytes);
}
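/// Word-aligned MemorySegment variant with an explicit seed; length_in_bytes must be a multiple of 4.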
static int32_t HashBytesByWords(const MemorySegment& segment, int32_t offset,
int32_t length_in_bytes, int32_t seed) {
int32_t h1 = HashBytesByInt(segment, offset, length_in_bytes, seed);
return Fmix(h1, length_in_bytes);
}
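/// MemorySegment variant for arbitrary lengths: whole 4-byte words first, then each remaining tail byte.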
static int32_t HashBytes(const MemorySegment& segment, int32_t offset, int32_t length_in_bytes,
int32_t seed) {
int32_t length_aligned = length_in_bytes - length_in_bytes % 4;
int32_t h1 = HashBytesByInt(segment, offset, length_aligned, seed);
for (int32_t i = length_aligned; i < length_in_bytes; i++) {
int32_t k1 = MixK1(segment.Get(offset + i));
h1 = MixH1(h1, k1);
}
return Fmix(h1, length_in_bytes);
}
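/// Core loop: mix the input into the hash state one 4-byte word at a time.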
static int32_t HashUnsafeBytesByInt(const void* base, int64_t offset, int32_t length_in_bytes,
int32_t seed) {
assert(length_in_bytes % 4 == 0);
int32_t h1 = seed;
for (int32_t i = 0; i < length_in_bytes; i += 4) {
int32_t half_word = GetInt(base, offset + i);
int32_t k1 = MixK1(half_word);
h1 = MixH1(h1, k1);
}
return h1;
}
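/// Core loop over a MemorySegment: mix one 4-byte word at a time.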
static int32_t HashBytesByInt(const MemorySegment& segment, int32_t offset,
int32_t length_in_bytes, int32_t seed) {
assert(length_in_bytes % 4 == 0);
int32_t h1 = seed;
for (int32_t i = 0; i < length_in_bytes; i += 4) {
auto half_word = segment.GetValue<int32_t>(offset + i);
int32_t k1 = MixK1(half_word);
h1 = MixH1(h1, k1);
}
return h1;
}
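/// First MurmurHash3 mixing step: scramble a 4-byte block before it is combined into the hash state.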
static int32_t MixK1(uint32_t k1) {
k1 *= C1;
k1 = MMH_rotl32(k1, 15);
k1 *= C2;
return k1;
}
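/// Second MurmurHash3 mixing step: combine a scrambled block into the running hash state.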
static int32_t MixH1(uint32_t h1, uint32_t k1) {
h1 ^= k1;
h1 = MMH_rotl32(h1, 13);
h1 = h1 * 5 + 0xe6546b64;
return h1;
}
// Finalization mix - force all bits of a hash block to avalanche
static int32_t Fmix(uint32_t h1, uint32_t length) {
h1 ^= length;
return Fmix(h1);
}
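/// Read an unaligned 32-bit integer at base + offset (native byte order).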
static int32_t GetInt(const void* base, int64_t offset) {
int32_t value;
std::memcpy(&value, static_cast<const char*>(base) + offset, sizeof(int32_t));
return value;
}
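/// Read a single byte at base + offset.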
static char GetByte(const void* base, int64_t offset) {
char value;
std::memcpy(&value, static_cast<const char*>(base) + offset, sizeof(char));
return value;
}
public:
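/// Final avalanche step: spread every input bit across the 32-bit result.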
static int32_t Fmix(uint32_t h) {
h ^= h >> 16;
h *= 0x85ebca6b;
h ^= h >> 13;
h *= 0xc2b2ae35;
h ^= h >> 16;
return h;
}
private:
// MurmurHash3 x86_32 multiplicative constants; kept unsigned to avoid a narrowing conversion.
static constexpr uint32_t C1 = 0xcc9e2d51u;
static constexpr uint32_t C2 = 0x1b873593u;
};
} // namespace paimon