blob: f7849da41a01400d075d036e4d0a884c9895a871 [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
use crate::data_type::AsBytes;
/// Computes hash value for `data`, with a seed value `seed`.
/// The data type `T` must implement the `AsBytes` trait.
pub fn hash<T: AsBytes>(data: &T, seed: u32) -> u32 {
hash_(data.as_bytes(), seed)
}
fn hash_(data: &[u8], seed: u32) -> u32 {
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
unsafe {
if is_x86_feature_detected!("sse4.2") {
crc32_hash(data, seed)
} else {
murmur_hash2_64a(data, seed as u64) as u32
}
}
#[cfg(any(target_arch = "aarch64", target_arch = "arm"))]
unsafe {
murmur_hash2_64a(data, seed as u64) as u32
}
}
const MURMUR_PRIME: u64 = 0xc6a4a7935bd1e995;
const MURMUR_R: i32 = 47;
/// Rust implementation of MurmurHash2, 64-bit version for 64-bit platforms
///
/// SAFTETY Only safe on platforms which support unaligned loads (like x86_64)
unsafe fn murmur_hash2_64a(data_bytes: &[u8], seed: u64) -> u64 {
let len = data_bytes.len();
let len_64 = (len / 8) * 8;
let data_bytes_64 = std::slice::from_raw_parts(
&data_bytes[0..len_64] as *const [u8] as *const u64,
len / 8,
);
let mut h = seed ^ (MURMUR_PRIME.wrapping_mul(data_bytes.len() as u64));
for v in data_bytes_64 {
let mut k = *v;
k = k.wrapping_mul(MURMUR_PRIME);
k ^= k >> MURMUR_R;
k = k.wrapping_mul(MURMUR_PRIME);
h ^= k;
h = h.wrapping_mul(MURMUR_PRIME);
}
let data2 = &data_bytes[len_64..];
let v = len & 7;
if v == 7 {
h ^= (data2[6] as u64) << 48;
}
if v >= 6 {
h ^= (data2[5] as u64) << 40;
}
if v >= 5 {
h ^= (data2[4] as u64) << 32;
}
if v >= 4 {
h ^= (data2[3] as u64) << 24;
}
if v >= 3 {
h ^= (data2[2] as u64) << 16;
}
if v >= 2 {
h ^= (data2[1] as u64) << 8;
}
if v >= 1 {
h ^= data2[0] as u64;
}
if v > 0 {
h = h.wrapping_mul(MURMUR_PRIME);
}
h ^= h >> MURMUR_R;
h = h.wrapping_mul(MURMUR_PRIME);
h ^= h >> MURMUR_R;
h
}
/// CRC32 hash implementation using SSE4 instructions. Borrowed from Impala.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "sse4.2")]
unsafe fn crc32_hash(bytes: &[u8], seed: u32) -> u32 {
#[cfg(target_arch = "x86")]
use std::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;
let u32_num_bytes = std::mem::size_of::<u32>();
let mut num_bytes = bytes.len();
let num_words = num_bytes / u32_num_bytes;
num_bytes %= u32_num_bytes;
let bytes_u32: &[u32] = std::slice::from_raw_parts(
&bytes[0..num_words * u32_num_bytes] as *const [u8] as *const u32,
num_words,
);
let mut offset = 0;
let mut hash = seed;
while offset < num_words {
hash = _mm_crc32_u32(hash, bytes_u32[offset]);
offset += 1;
}
offset = num_words * u32_num_bytes;
while offset < num_bytes {
hash = _mm_crc32_u8(hash, bytes[offset]);
offset += 1;
}
// The lower half of the CRC hash has poor uniformity, so swap the halves
// for anyone who only uses the first several bits of the hash.
hash = (hash << 16) | (hash >> 16);
hash
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_murmur2_64a() {
unsafe {
let result = murmur_hash2_64a(b"hello", 123);
assert_eq!(result, 2597646618390559622);
let result = murmur_hash2_64a(b"helloworld", 123);
assert_eq!(result, 4934371746140206573);
let result = murmur_hash2_64a(b"helloworldparquet", 123);
assert_eq!(result, 2392198230801491746);
}
}
#[test]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
fn test_crc32() {
if is_x86_feature_detected!("sse4.2") {
unsafe {
let result = crc32_hash(b"hello", 123);
assert_eq!(result, 2927487359);
let result = crc32_hash(b"helloworld", 123);
assert_eq!(result, 314229527);
let result = crc32_hash(b"helloworldparquet", 123);
assert_eq!(result, 667078870);
}
}
}
}