// ZP7 (Zach's Peppy Parallel-Prefix-Popcountin' PEXT/PDEP Polyfill)
//
// Copyright (c) 2020 Zach Wegner
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
// Based on https://github.com/zwegner/zp7 as of
// a1ed4e5ace07f7d69cb50af1cbd37df4fa3d87af
//
// This is imported rather than included via thirdparty since the upstream
// project has no header file. It has been modified as follows:
//
// - remove 'pdep' implementations since we only need 'pext'.
// - separate the clmul and non-accelerated variants into separate
//   functions so they can be switched at runtime.
// - put inside a kudu namespace
// - disable UBSAN for some undefined integer casts
#include "kudu/common/zp7.h"
#include "kudu/gutil/port.h"
#ifdef __x86_64__
#include <emmintrin.h>
#elif defined(__aarch64__)
#include "kudu/util/sse2neon.h"
#endif
#ifndef __aarch64__
#if !defined(__clang__) && defined(__GNUC__) && __GNUC__ < 5
#define USE_INLINE_ASM_CLMUL
#else
#include <wmmintrin.h>
#endif
#endif
#define N_BITS (6)
namespace kudu {
typedef struct {
  uint64_t mask;
  uint64_t ppp_bit[N_BITS];
} zp7_masks_64_t;
// If we don't have access to the CLMUL instruction, emulate it with
// shifts and XORs
static inline uint64_t prefix_sum(uint64_t x) {
  for (int i = 0; i < N_BITS; i++)
    x ^= x << (1 << i);
  return x;
}
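// Illustrative note added in this import (not from upstream zp7): prefix_sum()
// is a carry-less prefix sum, i.e. bit i of the result is the XOR (parity) of
// bits 0..i of the input. Tracing just the low 8 bits of an example input:
//
//   x                 = 0b00010100
//   x ^= x << 1  -> x = 0b00111100
//   x ^= x << 2  -> x = 0b11001100
//   x ^= x << 4  -> x = 0b00001100  (the shifts by 8/16/32 leave the low byte alone)
//
// and 0b00001100 is indeed the parity of the input bits at or below each position.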
// GCC <5 doesn't properly handle the _mm_clmulepi64_si128 intrinsic inside
// a function with a specified target attribute. So, use inline
// assembly instead.
#ifdef USE_INLINE_ASM_CLMUL
static inline __m128i asm_mm_clmulepi64_si128(__m128i a, __m128i b) {
asm ("pclmulqdq $0, %1, %0"
: "+x" (a)
: "xm" (b));
return a;
}
#define CLMUL asm_mm_clmulepi64_si128
#else
#define CLMUL(a, b) (_mm_clmulepi64_si128(a, b, 0))
#endif
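// Note added in this import: carry-less multiplication by -2 (every bit set
// except bit 0) XORs into output bit i all input bits strictly below i, so the
// low 64 bits of CLMUL(x, neg_2) equal prefix_sum(x << 1) from the
// non-accelerated path above. The two zp7_ppp_64 variants below are therefore
// bit-for-bit equivalent.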
// Parallel-prefix-popcount. In upstream zp7 this is used by both the PEXT and
// PDEP polyfills; only PEXT is needed here. It can also be called separately
// and cached, if the mask values will be used more than once.
//
// This variant depends on the CLMUL instruction.
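// Explanatory note added in this import: after this runs, bit j of
// r.ppp_bit[i] holds bit i of the number of *zero* bits of 'mask' strictly
// below position j, which is exactly how far bit j must move right during
// PEXT. For example, with mask = 0xF0, each set mask bit (positions 4..7) has
// four zero mask bits below it, so ppp_bit[2] has bits 4..7 set while
// ppp_bit[0] and ppp_bit[1] have them clear, and zp7_pext_pre_64() reduces to
// (a >> 4) & 0xF for this mask.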
#ifndef __aarch64__
__attribute__((target("pclmul")))
#endif // __aarch64__
ATTRIBUTE_NO_SANITIZE_INTEGER
static zp7_masks_64_t zp7_ppp_64_clmul(uint64_t mask) {
  zp7_masks_64_t r;
  r.mask = mask;
  // Count *unset* bits
  mask = ~mask;
  // Move the mask and -2 to XMM registers for CLMUL
  __m128i m = _mm_cvtsi64_si128(mask);
  __m128i neg_2 = _mm_cvtsi64_si128(-2LL);
  for (int i = 0; i < N_BITS - 1; i++) {
    // Do a 1-bit parallel prefix popcount, shifted left by 1,
    // in one carry-less multiply by -2.
    __m128i bit = CLMUL(m, neg_2);
    r.ppp_bit[i] = _mm_cvtsi128_si64(bit);
    // Get the carry bit of the 1-bit parallel prefix popcount. On
    // the next iteration, we will sum this bit to get the next mask
    m = _mm_and_si128(m, bit);
  }
  // For the last iteration, we can use a regular multiply by -2 instead of a
  // carry-less one (or rather, a strength reduction of that, with
  // neg/add/etc), since there can't be any carries anyways. That is because
  // the last value of m (which has one bit set for every 32nd unset mask bit)
  // has at most two bits set in it, when mask is zero and thus there are 64
  // bits set in ~mask. If two bits are set, one of them is the top bit, which
  // gets shifted out, since we're counting bits below each mask bit.
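  // (Worked check added in this import: if mask == 0, m is now
  // 0x8000000080000000, i.e. bits 31 and 63, one for every 32nd bit of ~mask.
  // The desired ppp_bit[5] has bit j set exactly when bit 5 of the zero-bit
  // count below j (which is just j here) is set, i.e. for j = 32..63, giving
  // 0xFFFFFFFF00000000 -- and indeed
  // -m << 1 == 0x7FFFFFFF80000000 << 1 == 0xFFFFFFFF00000000.)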
  r.ppp_bit[N_BITS - 1] = -_mm_cvtsi128_si64(m) << 1;
  return r;
}
// Implementation that doesn't depend on CLMUL
ATTRIBUTE_NO_SANITIZE_INTEGER
static zp7_masks_64_t zp7_ppp_64_simple(uint64_t mask) {
  zp7_masks_64_t r;
  r.mask = mask;
  // Count *unset* bits
  mask = ~mask;
  for (int i = 0; i < N_BITS - 1; i++) {
    // Do a 1-bit parallel prefix popcount, shifted left by 1
    uint64_t bit = prefix_sum(mask << 1);
    r.ppp_bit[i] = bit;
    // Get the carry bit of the 1-bit parallel prefix popcount. On
    // the next iteration, we will sum this bit to get the next mask
    mask &= bit;
  }
  // The last iteration won't carry, so just use neg/shift. See the CLMUL
  // case above for justification.
  r.ppp_bit[N_BITS - 1] = -mask << 1;
  return r;
}
// PEXT
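// Illustrative example added in this import: PEXT gathers the bits of 'a'
// selected by 'mask' and packs them contiguously at the bottom. For instance,
// zp7_pext_64_simple(0b10110101, 0b01101010) == 0b0100, because the bits of
// 'a' at the mask positions 1, 3, 5 and 6 are 0, 0, 1 and 0 respectively.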
static uint64_t zp7_pext_pre_64(uint64_t a, const zp7_masks_64_t *masks) {
  // Mask only the bits that are set in the input mask. Otherwise they collide
  // with input bits and screw everything up
  a &= masks->mask;
  // For each bit in the PPP, shift right only those bits that are set in
  // that bit's mask
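  // Worked trace added in this import: with a = 0b10110101 and
  // mask = 0b01101010 (the example above this function), a &= mask leaves only
  // bit 5. Bit 5 has three zero mask bits below it (positions 0, 2 and 4), so
  // it must move right by 3 = 0b011: the i = 0 pass shifts it right by 1 (to
  // bit 4), the i = 1 pass by 2 (to bit 2), and the remaining passes leave it
  // in place, giving 0b0100.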
  for (int i = 0; i < N_BITS; i++) {
    uint64_t shift = 1 << i;
    uint64_t bit = masks->ppp_bit[i];
    // Shift right only the input bits that are set in this bit's mask; leave
    // the rest in place
    a = (a & ~bit) | ((a & bit) >> shift);
  }
  return a;
}
uint64_t zp7_pext_64_simple(uint64_t a, uint64_t mask) {
  zp7_masks_64_t masks = zp7_ppp_64_simple(mask);
  return zp7_pext_pre_64(a, &masks);
}
uint64_t zp7_pext_64_clmul(uint64_t a, uint64_t mask) {
  zp7_masks_64_t masks = zp7_ppp_64_clmul(mask);
  return zp7_pext_pre_64(a, &masks);
}
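// A hypothetical usage sketch (added in this import; 'has_pclmul' is a
// made-up placeholder, not an API defined here): callers are expected to pick
// a variant at runtime based on CPU support for the PCLMULQDQ instruction,
// e.g.
//
//   bool has_pclmul = ...;  // detected once, e.g. via CPUID
//   uint64_t packed = has_pclmul ? zp7_pext_64_clmul(a, mask)
//                                : zp7_pext_64_simple(a, mask);
//
// Both variants return the same value the hardware PEXT instruction would;
// only their speed differs.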
} // namespace kudu