// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#ifndef KUDU_UTIL_GROUP_VARINT_INL_H
#define KUDU_UTIL_GROUP_VARINT_INL_H
#ifdef __linux__
#include <endian.h>
#endif
#ifdef __aarch64__
#include "kudu/util/sse2neon.h"
#else
#include <emmintrin.h>
#include <smmintrin.h>
#include <tmmintrin.h>
#include <xmmintrin.h>
#endif //__aarch64__
#include <cstdint>
#include <cstring>
#include <boost/utility/binary.hpp>
#include <boost/preprocessor/arithmetic/dec.hpp>
#include <boost/preprocessor/arithmetic/inc.hpp>
#include <boost/preprocessor/control/iif.hpp>
#include <boost/preprocessor/control/while.hpp>
#include <boost/preprocessor/list/fold_left.hpp>
#include <boost/preprocessor/logical/bitand.hpp>
#include <boost/preprocessor/logical/bool.hpp>
#include <boost/preprocessor/logical/compl.hpp>
#include <boost/preprocessor/seq/elem.hpp>
#include <boost/preprocessor/seq/fold_left.hpp>
#include <boost/preprocessor/seq/size.hpp>
#include <boost/preprocessor/tuple/elem.hpp>
#include <boost/preprocessor/variadic/elem.hpp>
#include <glog/logging.h>
#ifndef __linux__
#include "kudu/gutil/port.h"
#endif
#include "kudu/util/faststring.h"
namespace kudu {
namespace coding {
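// Lookup tables, defined out of line and initialized once at startup
// (SSE_TABLE_INITTED is set when they are ready):
// - SSE_TABLE: for each selector byte 'sel', SSE_TABLE[sel * 16] begins a
//   16-byte pshufb mask that expands the packed payload bytes into four
//   32-bit lanes.
// - VARINT_SELECTOR_LENGTHS: total payload length (excluding the selector
//   byte itself) for each selector byte.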
extern bool SSE_TABLE_INITTED;
extern uint8_t SSE_TABLE[256 * 16] __attribute__((aligned(16)));
extern uint8_t VARINT_SELECTOR_LENGTHS[256];
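// MASKS[i] keeps the low i+1 bytes of a little-endian 4-byte load.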
const uint32_t MASKS[4] = { 0xff, 0xffff, 0xffffff, 0xffffffff };
// Calculate the number of bytes needed to encode the given unsigned int.
inline size_t CalcRequiredBytes32(uint32_t i) {
// | 1 because __builtin_clz(0) is undefined
return sizeof(uint32_t) - __builtin_clz(i|1)/8;
}
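// For example (illustrative):
//   CalcRequiredBytes32(0)          == 1  (the |1 keeps clz defined)
//   CalcRequiredBytes32(0x12)       == 1
//   CalcRequiredBytes32(0x1234)     == 2
//   CalcRequiredBytes32(0x123456)   == 3
//   CalcRequiredBytes32(0x12345678) == 4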
// Decode a set of 4 group-varint encoded integers from the given pointer.
//
// Requires that there are at least 3 extra bytes remaining in 'src' after
// the last integer, since each value is read with a full 4-byte load.
//
// Returns a pointer following the last decoded integer.
inline const uint8_t *DecodeGroupVarInt32(
const uint8_t *src,
uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d) {
uint8_t a_sel = (*src & BOOST_BINARY(11 00 00 00)) >> 6;
uint8_t b_sel = (*src & BOOST_BINARY(00 11 00 00)) >> 4;
uint8_t c_sel = (*src & BOOST_BINARY(00 00 11 00)) >> 2;
uint8_t d_sel = (*src & BOOST_BINARY(00 00 00 11));
src++; // skip past selector byte
*a = *reinterpret_cast<const uint32_t *>(src) & MASKS[a_sel];
src += a_sel + 1;
*b = *reinterpret_cast<const uint32_t *>(src) & MASKS[b_sel];
src += b_sel + 1;
*c = *reinterpret_cast<const uint32_t *>(src) & MASKS[c_sel];
src += c_sel + 1;
*d = *reinterpret_cast<const uint32_t *>(src) & MASKS[d_sel];
src += d_sel + 1;
return src;
}
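// Worked example (illustrative): selector byte 0x18 = 0b00011000 encodes
// lengths a=1, b=2, c=3, d=1 bytes, so the 8-byte group
//   { 0x18, 0x01, 0x00, 0x01, 0x70, 0x11, 0x01, 0x05 }
// decodes (little-endian) to a=1, b=256, c=70000, d=5 and returns
// src + 8. Note the final load still reads 4 bytes, hence the slack
// requirement above.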
// Decode the total length of a group of encoded integers at the given
// pointer, including the tag byte.
inline size_t DecodeGroupVarInt32_GetGroupSize(const uint8_t *src) {
return VARINT_SELECTOR_LENGTHS[*src] + 1;
}
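// For example, with selector byte 0x18, VARINT_SELECTOR_LENGTHS[0x18] == 7
// (1 + 2 + 3 + 1 payload bytes), so the group spans 8 bytes in total.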
// Decode a set of 4 group-varint encoded integers from the given pointer.
// Unlike DecodeGroupVarInt32 above, this is safe to use near the end of a
// buffer: the group is first copied into a local scratch buffer.
//
// Returns a pointer following the last decoded integer.
inline const uint8_t *DecodeGroupVarInt32_SlowButSafe(
const uint8_t *src,
uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d) {
// VARINT_SELECTOR_LENGTHS[] isn't initialized until SSE_TABLE_INITTED is true
DCHECK(SSE_TABLE_INITTED);
const size_t total_len = DecodeGroupVarInt32_GetGroupSize(src);
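// The largest possible group is 1 selector byte + 4 * 4 value bytes = 17.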
uint8_t safe_buf[17];
memcpy(safe_buf, src, total_len);
DecodeGroupVarInt32(safe_buf, a, b, c, d);
return src + total_len;
}
#ifndef __aarch64__
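// Extract the four 32-bit lanes of 'results' into *a, *b, *c, *d.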
inline void DoExtractM128(__m128i results,
uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d) {
#define SSE_USE_EXTRACT_PS
#ifdef SSE_USE_EXTRACT_PS
// _mm_extract_ps turns into extractps, which is slightly faster
// than _mm_extract_epi32 (which turns into pextrd)
// Apparently pextrd involves one more micro-op
// than extractps.
//
// A uint32 cfile macro-benchmark is about 3% faster with this code path.
*a = _mm_extract_ps((__v4sf)results, 0);
*b = _mm_extract_ps((__v4sf)results, 1);
*c = _mm_extract_ps((__v4sf)results, 2);
*d = _mm_extract_ps((__v4sf)results, 3);
#else
*a = _mm_extract_epi32(results, 0);
*b = _mm_extract_epi32(results, 1);
*c = _mm_extract_epi32(results, 2);
*d = _mm_extract_epi32(results, 3);
#endif
}
// Same as above, but uses SSE so may be faster.
// TODO: remove this and just automatically pick the right implementation at runtime.
//
// NOTE: the src buffer must have at least 17 bytes remaining in it, so this
// code path is not usable at the end of a block.
inline const uint8_t *DecodeGroupVarInt32_SSE(
const uint8_t *src,
uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d) {
DCHECK(SSE_TABLE_INITTED);
uint8_t sel_byte = *src++;
__m128i shuffle_mask = _mm_load_si128(
reinterpret_cast<__m128i *>(&SSE_TABLE[sel_byte * 16]));
__m128i data = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src));
__m128i results = _mm_shuffle_epi8(data, shuffle_mask);
// It might look like the following would be most efficient,
// since it turns into a single movdqa instruction:
//
//   *reinterpret_cast<__m128i *>(ret) = results;
//
// (where 'ret' is an aligned array of ints, which the user must pass)
// but it is actually slower than the alternative below by a
// good amount -- even though it results in more instructions.
DoExtractM128(results, a, b, c, d);
src += VARINT_SELECTOR_LENGTHS[sel_byte];
return src;
}
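// For the 0x18 example above, the shuffle mask scatters payload bytes
// {0}, {1,2}, {3,4,5}, {6} into the four 32-bit lanes, zero-filling the
// rest (pshufb writes 0 wherever the mask byte's high bit is set), again
// yielding a=1, b=256, c=70000, d=5.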
// Optimized function which decodes a group of uint32s from 'src' into 'ret',
// which should have enough space for 4 uint32s. During decoding, adds 'add'
// to the vector in parallel.
//
// NOTE: the src buffer must have at least 17 bytes remaining in it, so this
// code path is not usable at the end of a block.
inline const uint8_t *DecodeGroupVarInt32_SSE_Add(
const uint8_t *src,
uint32_t *ret,
__m128i add) {
DCHECK(SSE_TABLE_INITTED);
uint8_t sel_byte = *src++;
__m128i shuffle_mask = _mm_load_si128(
reinterpret_cast<__m128i *>(&SSE_TABLE[sel_byte * 16]));
__m128i data = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src));
__m128i decoded_deltas = _mm_shuffle_epi8(data, shuffle_mask);
__m128i results = _mm_add_epi32(decoded_deltas, add);
DoExtractM128(results, &ret[0], &ret[1], &ret[2], &ret[3]);
src += VARINT_SELECTOR_LENGTHS[sel_byte];
return src;
}
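// Illustrative use for delta decoding (hypothetical caller code, not an
// API defined here):
//
//   uint32_t out[4];
//   __m128i base = _mm_set1_epi32(prev_value);
//   src = DecodeGroupVarInt32_SSE_Add(src, out, base);
//
// decodes four deltas and rebases them against 'prev_value' in one step.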
#endif //__aarch64__
// Append a set of group-varint encoded integers to the given faststring.
inline void AppendGroupVarInt32(
faststring *s,
uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
uint8_t a_tag = CalcRequiredBytes32(a) - 1;
uint8_t b_tag = CalcRequiredBytes32(b) - 1;
uint8_t c_tag = CalcRequiredBytes32(c) - 1;
uint8_t d_tag = CalcRequiredBytes32(d) - 1;
uint8_t prefix_byte =
(a_tag << 6) |
(b_tag << 4) |
(c_tag << 2) |
(d_tag);
uint8_t size = 1 +
a_tag + 1 +
b_tag + 1 +
c_tag + 1 +
d_tag + 1;
size_t old_size = s->size();
// Reserving 4 extra bytes means we can use simple
// 4-byte stores instead of variable copies here --
// if we hang off the end of the array into the "empty" area, it's OK.
// We'll chop it back off down below.
s->resize(old_size + size + 4);
uint8_t *ptr = &((*s)[old_size]);
#if __BYTE_ORDER != __LITTLE_ENDIAN
#error big-endian platforms are not currently supported
#endif
*ptr++ = prefix_byte;
memcpy(ptr, &a, 4);
ptr += a_tag + 1;
memcpy(ptr, &b, 4);
ptr += b_tag + 1;
memcpy(ptr, &c, 4);
ptr += c_tag + 1;
memcpy(ptr, &d, 4);
s->resize(old_size + size);
}
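// Example (illustrative): AppendGroupVarInt32(&s, 1, 256, 70000, 5)
// appends the 8 bytes { 0x18, 0x01, 0x00, 0x01, 0x70, 0x11, 0x01, 0x05 },
// the inverse of the decoding example above.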
// Append a sequence of uint32s encoded using group-varint.
//
// 'frame_of_reference' is subtracted from each integer
// before encoding.
//
// If 'frame_of_reference' is greater than any element in the array,
// the results are undefined.
//
// For best performance, users should already have reserved adequate
// space in 's' (CalcRequiredBytes32 can be handy here).
inline void AppendGroupVarInt32Sequence(faststring *s, uint32_t frame_of_reference,
uint32_t *ints, size_t size) {
uint32_t *p = ints;
while (size >= 4) {
AppendGroupVarInt32(s,
p[0] - frame_of_reference,
p[1] - frame_of_reference,
p[2] - frame_of_reference,
p[3] - frame_of_reference);
size -= 4;
p += 4;
}
uint32_t trailer[4] = {0, 0, 0, 0};
uint32_t *trailer_p = &trailer[0];
if (size > 0) {
while (size > 0) {
*trailer_p++ = *p++ - frame_of_reference;
size--;
}
AppendGroupVarInt32(s, trailer[0], trailer[1], trailer[2], trailer[3]);
}
}
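// Example (illustrative): with frame_of_reference = 100 and
// ints = { 103, 104, 110 } (size = 3), the deltas 3, 4, 10 are padded
// with a trailing 0 and emitted as a single group of four; callers must
// track the true count so the padding can be ignored on decode.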
} // namespace coding
} // namespace kudu
#endif // KUDU_UTIL_GROUP_VARINT_INL_H