blob: eacf3623087a0c110f6cd6a9d465f8d77e7514e1 [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
extern "C" {
#include <string.h>
#include "./types.h"
static inline gdv_uint64 rotate_left(gdv_uint64 val, int distance) {
return (val << distance) | (val >> (64 - distance));
}
//
// MurmurHash3 was written by Austin Appleby, and is placed in the public
// domain.
// See http://smhasher.googlecode.com/svn/trunk/MurmurHash3.cpp
// MurmurHash3_x64_128
//
static inline gdv_uint64 fmix64(gdv_uint64 k) {
k ^= k >> 33;
k *= 0xff51afd7ed558ccduLL;
k ^= k >> 33;
k *= 0xc4ceb9fe1a85ec53uLL;
k ^= k >> 33;
return k;
}
static inline gdv_uint64 murmur3_64(gdv_uint64 val, gdv_int32 seed) {
gdv_uint64 h1 = seed;
gdv_uint64 h2 = seed;
gdv_uint64 c1 = 0x87c37b91114253d5ull;
gdv_uint64 c2 = 0x4cf5ad432745937full;
int length = 8;
gdv_uint64 k1 = 0;
k1 = val;
k1 *= c1;
k1 = rotate_left(k1, 31);
k1 *= c2;
h1 ^= k1;
h1 ^= length;
h2 ^= length;
h1 += h2;
h2 += h1;
h1 = fmix64(h1);
h2 = fmix64(h2);
h1 += h2;
// h2 += h1;
// murmur3_128 should return 128 bit (h1,h2), now we return only 64bits,
return h1;
}
static inline gdv_uint32 murmur3_32(gdv_uint64 val, gdv_int32 seed) {
gdv_uint64 c1 = 0xcc9e2d51ull;
gdv_uint64 c2 = 0x1b873593ull;
int length = 8;
static gdv_uint64 UINT_MASK = 0xffffffffull;
gdv_uint64 lh1 = seed & UINT_MASK;
for (int i = 0; i < 2; i++) {
gdv_uint64 lk1 = ((val >> i * 32) & UINT_MASK);
lk1 *= c1;
lk1 &= UINT_MASK;
lk1 = ((lk1 << 15) & UINT_MASK) | (lk1 >> 17);
lk1 *= c2;
lk1 &= UINT_MASK;
lh1 ^= lk1;
lh1 = ((lh1 << 13) & UINT_MASK) | (lh1 >> 19);
lh1 = lh1 * 5 + 0xe6546b64L;
lh1 = UINT_MASK & lh1;
}
lh1 ^= length;
lh1 ^= lh1 >> 16;
lh1 *= 0x85ebca6bull;
lh1 = UINT_MASK & lh1;
lh1 ^= lh1 >> 13;
lh1 *= 0xc2b2ae35ull;
lh1 = UINT_MASK & lh1;
lh1 ^= lh1 >> 16;
return static_cast<gdv_uint32>(lh1);
}
static inline gdv_uint64 double_to_long_bits(double value) {
gdv_uint64 result;
memcpy(&result, &value, sizeof(result));
return result;
}
FORCE_INLINE gdv_int64 hash64(double val, gdv_int64 seed) {
return murmur3_64(double_to_long_bits(val), static_cast<gdv_int32>(seed));
}
FORCE_INLINE gdv_int32 hash32(double val, gdv_int32 seed) {
return murmur3_32(double_to_long_bits(val), seed);
}
// Wrappers for all the numeric/data/time arrow types
#define HASH64_WITH_SEED_OP(NAME, TYPE) \
FORCE_INLINE \
gdv_int64 NAME##_##TYPE(gdv_##TYPE in, gdv_boolean is_valid, gdv_int64 seed, \
gdv_boolean seed_isvalid) { \
if (!is_valid) { \
return seed; \
} \
return hash64(static_cast<double>(in), seed); \
}
#define HASH32_WITH_SEED_OP(NAME, TYPE) \
FORCE_INLINE \
gdv_int32 NAME##_##TYPE(gdv_##TYPE in, gdv_boolean is_valid, gdv_int32 seed, \
gdv_boolean seed_isvalid) { \
if (!is_valid) { \
return seed; \
} \
return hash32(static_cast<double>(in), seed); \
}
#define HASH64_OP(NAME, TYPE) \
FORCE_INLINE \
gdv_int64 NAME##_##TYPE(gdv_##TYPE in, gdv_boolean is_valid) { \
return is_valid ? hash64(static_cast<double>(in), 0) : 0; \
}
#define HASH32_OP(NAME, TYPE) \
FORCE_INLINE \
gdv_int32 NAME##_##TYPE(gdv_##TYPE in, gdv_boolean is_valid) { \
return is_valid ? hash32(static_cast<double>(in), 0) : 0; \
}
// Expand inner macro for all numeric types.
#define NUMERIC_BOOL_DATE_TYPES(INNER, NAME) \
INNER(NAME, int8) \
INNER(NAME, int16) \
INNER(NAME, int32) \
INNER(NAME, int64) \
INNER(NAME, uint8) \
INNER(NAME, uint16) \
INNER(NAME, uint32) \
INNER(NAME, uint64) \
INNER(NAME, float32) \
INNER(NAME, float64) \
INNER(NAME, boolean) \
INNER(NAME, date64) \
INNER(NAME, date32) \
INNER(NAME, time32) \
INNER(NAME, timestamp)
NUMERIC_BOOL_DATE_TYPES(HASH32_OP, hash)
NUMERIC_BOOL_DATE_TYPES(HASH32_OP, hash32)
NUMERIC_BOOL_DATE_TYPES(HASH32_OP, hash32AsDouble)
NUMERIC_BOOL_DATE_TYPES(HASH32_WITH_SEED_OP, hash32WithSeed)
NUMERIC_BOOL_DATE_TYPES(HASH32_WITH_SEED_OP, hash32AsDoubleWithSeed)
NUMERIC_BOOL_DATE_TYPES(HASH64_OP, hash64)
NUMERIC_BOOL_DATE_TYPES(HASH64_OP, hash64AsDouble)
NUMERIC_BOOL_DATE_TYPES(HASH64_WITH_SEED_OP, hash64WithSeed)
NUMERIC_BOOL_DATE_TYPES(HASH64_WITH_SEED_OP, hash64AsDoubleWithSeed)
#undef NUMERIC_BOOL_DATE_TYPES
static inline gdv_uint64 murmur3_64_buf(const gdv_uint8* key, gdv_int32 len,
gdv_int32 seed) {
gdv_uint64 h1 = seed;
gdv_uint64 h2 = seed;
gdv_uint64 c1 = 0x87c37b91114253d5ull;
gdv_uint64 c2 = 0x4cf5ad432745937full;
const gdv_uint64* blocks = reinterpret_cast<const gdv_uint64*>(key);
int nblocks = len / 16;
for (int i = 0; i < nblocks; i++) {
gdv_uint64 k1 = blocks[i * 2 + 0];
gdv_uint64 k2 = blocks[i * 2 + 1];
k1 *= c1;
k1 = rotate_left(k1, 31);
k1 *= c2;
h1 ^= k1;
h1 = rotate_left(h1, 27);
h1 += h2;
h1 = h1 * 5 + 0x52dce729;
k2 *= c2;
k2 = rotate_left(k2, 33);
k2 *= c1;
h2 ^= k2;
h2 = rotate_left(h2, 31);
h2 += h1;
h2 = h2 * 5 + 0x38495ab5;
}
// tail
gdv_uint64 k1 = 0;
gdv_uint64 k2 = 0;
const gdv_uint8* tail = reinterpret_cast<const gdv_uint8*>(key + nblocks * 16);
switch (len & 15) {
case 15:
k2 = static_cast<gdv_uint64>(tail[14]) << 48;
case 14:
k2 ^= static_cast<gdv_uint64>(tail[13]) << 40;
case 13:
k2 ^= static_cast<gdv_uint64>(tail[12]) << 32;
case 12:
k2 ^= static_cast<gdv_uint64>(tail[11]) << 24;
case 11:
k2 ^= static_cast<gdv_uint64>(tail[10]) << 16;
case 10:
k2 ^= static_cast<gdv_uint64>(tail[9]) << 8;
case 9:
k2 ^= static_cast<gdv_uint64>(tail[8]);
k2 *= c2;
k2 = rotate_left(k2, 33);
k2 *= c1;
h2 ^= k2;
case 8:
k1 ^= static_cast<gdv_uint64>(tail[7]) << 56;
case 7:
k1 ^= static_cast<gdv_uint64>(tail[6]) << 48;
case 6:
k1 ^= static_cast<gdv_uint64>(tail[5]) << 40;
case 5:
k1 ^= static_cast<gdv_uint64>(tail[4]) << 32;
case 4:
k1 ^= static_cast<gdv_uint64>(tail[3]) << 24;
case 3:
k1 ^= static_cast<gdv_uint64>(tail[2]) << 16;
case 2:
k1 ^= static_cast<gdv_uint64>(tail[1]) << 8;
case 1:
k1 ^= static_cast<gdv_uint64>(tail[0]) << 0;
k1 *= c1;
k1 = rotate_left(k1, 31);
k1 *= c2;
h1 ^= k1;
}
h1 ^= len;
h2 ^= len;
h1 += h2;
h2 += h1;
h1 = fmix64(h1);
h2 = fmix64(h2);
h1 += h2;
// h2 += h1;
// returning 64-bits of the 128-bit hash.
return h1;
}
static gdv_uint32 murmur3_32_buf(const gdv_uint8* key, gdv_int32 len, gdv_int32 seed) {
static const gdv_uint64 c1 = 0xcc9e2d51ull;
static const gdv_uint64 c2 = 0x1b873593ull;
static const gdv_uint64 UINT_MASK = 0xffffffffull;
gdv_uint64 lh1 = seed;
const gdv_uint32* blocks = reinterpret_cast<const gdv_uint32*>(key);
int nblocks = len / 4;
const gdv_uint8* tail = reinterpret_cast<const gdv_uint8*>(key + nblocks * 4);
for (int i = 0; i < nblocks; i++) {
gdv_uint64 lk1 = static_cast<gdv_uint64>(blocks[i]);
// k1 *= c1;
lk1 *= c1;
lk1 &= UINT_MASK;
lk1 = ((lk1 << 15) & UINT_MASK) | (lk1 >> 17);
lk1 *= c2;
lk1 = lk1 & UINT_MASK;
lh1 ^= lk1;
lh1 = ((lh1 << 13) & UINT_MASK) | (lh1 >> 19);
lh1 = lh1 * 5 + 0xe6546b64ull;
lh1 = UINT_MASK & lh1;
}
// tail
gdv_uint64 lk1 = 0;
switch (len & 3) {
case 3:
lk1 = (tail[2] & 0xff) << 16;
case 2:
lk1 |= (tail[1] & 0xff) << 8;
case 1:
lk1 |= (tail[0] & 0xff);
lk1 *= c1;
lk1 = UINT_MASK & lk1;
lk1 = ((lk1 << 15) & UINT_MASK) | (lk1 >> 17);
lk1 *= c2;
lk1 = lk1 & UINT_MASK;
lh1 ^= lk1;
}
// finalization
lh1 ^= len;
lh1 ^= lh1 >> 16;
lh1 *= 0x85ebca6b;
lh1 = UINT_MASK & lh1;
lh1 ^= lh1 >> 13;
lh1 *= 0xc2b2ae35;
lh1 = UINT_MASK & lh1;
lh1 ^= lh1 >> 16;
return static_cast<gdv_uint32>(lh1 & UINT_MASK);
}
FORCE_INLINE gdv_int64 hash64_buf(const gdv_uint8* buf, int len, gdv_int64 seed) {
return murmur3_64_buf(buf, len, static_cast<gdv_int32>(seed));
}
FORCE_INLINE gdv_int32 hash32_buf(const gdv_uint8* buf, int len, gdv_int32 seed) {
return murmur3_32_buf(buf, len, seed);
}
// Wrappers for the varlen types
#define HASH64_BUF_WITH_SEED_OP(NAME, TYPE) \
FORCE_INLINE \
gdv_int64 NAME##_##TYPE(gdv_##TYPE in, gdv_int32 len, gdv_boolean is_valid, \
gdv_int64 seed, gdv_boolean seed_isvalid) { \
if (!is_valid) { \
return seed; \
} \
return hash64_buf(reinterpret_cast<const uint8_t*>(in), len, seed); \
}
#define HASH32_BUF_WITH_SEED_OP(NAME, TYPE) \
FORCE_INLINE \
gdv_int32 NAME##_##TYPE(gdv_##TYPE in, gdv_int32 len, gdv_boolean is_valid, \
gdv_int32 seed, gdv_boolean seed_isvalid) { \
if (!is_valid) { \
return seed; \
} \
return hash32_buf(reinterpret_cast<const uint8_t*>(in), len, seed); \
}
#define HASH64_BUF_OP(NAME, TYPE) \
FORCE_INLINE \
gdv_int64 NAME##_##TYPE(gdv_##TYPE in, gdv_int32 len, gdv_boolean is_valid) { \
return is_valid ? hash64_buf(reinterpret_cast<const uint8_t*>(in), len, 0) : 0; \
}
#define HASH32_BUF_OP(NAME, TYPE) \
FORCE_INLINE \
gdv_int32 NAME##_##TYPE(gdv_##TYPE in, gdv_int32 len, gdv_boolean is_valid) { \
return is_valid ? hash32_buf(reinterpret_cast<const uint8_t*>(in), len, 0) : 0; \
}
// Expand inner macro for all non-numeric types.
#define VAR_LEN_TYPES(INNER, NAME) \
INNER(NAME, utf8) \
INNER(NAME, binary)
VAR_LEN_TYPES(HASH32_BUF_OP, hash)
VAR_LEN_TYPES(HASH32_BUF_OP, hash32)
VAR_LEN_TYPES(HASH32_BUF_OP, hash32AsDouble)
VAR_LEN_TYPES(HASH32_BUF_WITH_SEED_OP, hash32WithSeed)
VAR_LEN_TYPES(HASH32_BUF_WITH_SEED_OP, hash32AsDoubleWithSeed)
VAR_LEN_TYPES(HASH64_BUF_OP, hash64)
VAR_LEN_TYPES(HASH64_BUF_OP, hash64AsDouble)
VAR_LEN_TYPES(HASH64_BUF_WITH_SEED_OP, hash64WithSeed)
VAR_LEN_TYPES(HASH64_BUF_WITH_SEED_OP, hash64AsDoubleWithSeed)
#undef HASH32_BUF_OP
#undef HASH32_BUF_WITH_SEED_OP
#undef HASH32_OP
#undef HASH32_WITH_SEED_OP
#undef HASH64_BUF_OP
#undef HASH64_BUF_WITH_SEED_OP
#undef HASH64_OP
#undef HASH64_WITH_SEED_OP
} // extern "C"