blob: 653c1714b2e3c6a8dcceeddd466898e260aaa2a2 [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
// From Apache Impala as of 2016-01-29. Pared down to a minimal set of
// functions needed for parquet-cpp
#ifndef PARQUET_UTIL_SSE_UTIL_H
#define PARQUET_UTIL_SSE_UTIL_H
#include <stdint.h>

#ifdef PARQUET_USE_SSE
#include <emmintrin.h>
#endif
namespace parquet {
/// Constants useful for text processing with the SSE4.2 string instructions
/// (PCMPESTRM/PCMPESTRI) and their control-byte encodings.
namespace SSEUtil {
/// Number of characters that fit in 64/128 bit register. SSE provides instructions
/// for loading 64 or 128 bits into a register at a time.
static const int CHARS_PER_64_BIT_REGISTER = 8;
static const int CHARS_PER_128_BIT_REGISTER = 16;
/// SSE4.2 adds instructions for text processing. The instructions have a control
/// byte that determines some of functionality of the instruction. (Equivalent to
/// GCC's _SIDD_CMP_EQUAL_ANY, etc).
static const int PCMPSTR_EQUAL_ANY = 0x00; // "equal any" aggregation, like strchr
static const int PCMPSTR_EQUAL_EACH = 0x08; // "equal each" aggregation, like strcmp
static const int PCMPSTR_UBYTE_OPS = 0x00; // unsigned char (8-bits, rather than 16)
static const int PCMPSTR_NEG_POLARITY = 0x10; // negate the result; see Intel SDM chapter 4.1.4.
/// In this mode, SSE text processing functions will return a mask of all the
/// characters that matched.
static const int STRCHR_MODE = PCMPSTR_EQUAL_ANY | PCMPSTR_UBYTE_OPS;
/// In this mode, SSE text processing functions will return the number of
/// bytes that match consecutively from the beginning (with negated polarity,
/// PCMPESTRI reports the index of the first byte that does NOT match).
static const int STRCMP_MODE =
PCMPSTR_EQUAL_EACH | PCMPSTR_UBYTE_OPS | PCMPSTR_NEG_POLARITY;
/// Precomputed mask values up to 16 bits: SSE_BITMASK[i] == 1 << i.
static const int SSE_BITMASK[CHARS_PER_128_BIT_REGISTER] = {
1 << 0, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7, 1 << 8, 1 << 9,
1 << 10, 1 << 11, 1 << 12, 1 << 13, 1 << 14, 1 << 15,
};
} // namespace SSEUtil
#ifdef PARQUET_USE_SSE
/// Define the SSE 4.2 intrinsics. The caller must first verify at runtime (or codegen
/// IR load time) that the processor supports SSE 4.2 before calling these. These are
/// defined outside the namespace because the IR w/ SSE 4.2 case needs to use macros.
#ifndef IR_COMPILE
/// When compiling to native code (i.e. not IR), we cannot use the -msse4.2 compiler
/// flag. Otherwise, the compiler will emit SSE 4.2 instructions outside of the runtime
/// SSE 4.2 checks and Impala will crash on CPUs that don't support SSE 4.2
/// (IMPALA-1399/1646). The compiler intrinsics cannot be used without -msse4.2, so we
/// define our own implementations of the intrinsics instead.
/// The PCMPxSTRy instructions require that the control byte 'mode' be encoded as an
/// immediate. So, those need to be always inlined in order to always propagate the
/// mode constant into the inline asm.
#define SSE_ALWAYS_INLINE inline __attribute__((__always_inline__))
/// Runs PCMPESTRM on (str1, len1) x (str2, len2) and returns the resulting match
/// mask. MODE is the PCMPxSTRy control byte (see SSEUtil::STRCHR_MODE etc.) and is
/// encoded as an instruction immediate via the "i" asm constraint, so it must be a
/// compile-time constant. Per the comment above (and SSE_ALWAYS_INLINE's stated
/// purpose), the function is forced always-inline so the mode constant reliably
/// propagates into the inline asm; the original declaration omitted the attribute.
/// The caller must have verified at runtime that the CPU supports SSE 4.2.
template <int MODE>
static inline __attribute__((__always_inline__)) __m128i SSE4_cmpestrm(
    __m128i str1, int len1, __m128i str2, int len2) {
#ifdef __clang__
  /// Use asm reg rather than Yz output constraint to workaround LLVM bug 13199 -
  /// clang doesn't support Y-prefixed asm constraints.
  register volatile __m128i result asm("xmm0");
  __asm__ volatile("pcmpestrm %5, %2, %1"
                   : "=x"(result)
                   : "x"(str1), "xm"(str2), "a"(len1), "d"(len2), "i"(MODE)
                   : "cc");
#else
  __m128i result;
  __asm__ volatile("pcmpestrm %5, %2, %1"
                   : "=Yz"(result)
                   : "x"(str1), "xm"(str2), "a"(len1), "d"(len2), "i"(MODE)
                   : "cc");
#endif
  return result;
}
/// Runs PCMPESTRI on (str1, len1) x (str2, len2) and returns the index result the
/// instruction delivers in ECX (the "=c" output). MODE is the PCMPxSTRy control byte
/// and is encoded as an immediate ("i" constraint), so it must be a compile-time
/// constant; as documented above, the function is forced always-inline so the mode
/// constant reliably propagates into the inline asm (the original declaration
/// omitted the attribute). Caller must have verified SSE 4.2 support at runtime.
template <int MODE>
static inline __attribute__((__always_inline__)) int SSE4_cmpestri(
    __m128i str1, int len1, __m128i str2, int len2) {
  int result;
  __asm__("pcmpestri %5, %2, %1"
          : "=c"(result)
          : "x"(str1), "xm"(str2), "a"(len1), "d"(len2), "i"(MODE)
          : "cc");
  return result;
}
/// Folds the single byte 'v' into checksum 'crc' via the SSE4.2 crc32b
/// instruction and returns the updated value. Caller must have verified at
/// runtime that the CPU supports SSE 4.2.
static inline uint32_t SSE4_crc32_u8(uint32_t crc, uint8_t v) {
  uint32_t hash = crc;
  __asm__("crc32b %1, %0" : "+r"(hash) : "rm"(v));
  return hash;
}
/// Folds the 16-bit value 'v' into checksum 'crc' via the SSE4.2 crc32w
/// instruction and returns the updated value. Caller must have verified at
/// runtime that the CPU supports SSE 4.2.
static inline uint32_t SSE4_crc32_u16(uint32_t crc, uint16_t v) {
  uint32_t hash = crc;
  __asm__("crc32w %1, %0" : "+r"(hash) : "rm"(v));
  return hash;
}
/// Folds the 32-bit value 'v' into checksum 'crc' via the SSE4.2 crc32l
/// instruction and returns the updated value. Caller must have verified at
/// runtime that the CPU supports SSE 4.2.
static inline uint32_t SSE4_crc32_u32(uint32_t crc, uint32_t v) {
  uint32_t hash = crc;
  __asm__("crc32l %1, %0" : "+r"(hash) : "rm"(v));
  return hash;
}
/// Folds the 64-bit value 'v' into checksum 'crc' via the SSE4.2 crc32q
/// instruction. crc32q operates on a 64-bit register, but only the low 32 bits
/// are significant, so the result is narrowed back to uint32_t on return.
/// Caller must have verified at runtime that the CPU supports SSE 4.2.
static inline uint32_t SSE4_crc32_u64(uint32_t crc, uint64_t v) {
  uint64_t hash = crc;
  __asm__("crc32q %1, %0" : "+r"(hash) : "rm"(v));
  return (uint32_t)hash;
}
/// Returns the number of set bits in 'a' using the POPCNT instruction.
/// Caller must have verified at runtime that the CPU supports it.
static inline int64_t POPCNT_popcnt_u64(uint64_t a) {
  int64_t bits;
  __asm__("popcntq %1, %0" : "=r"(bits) : "mr"(a) : "cc");
  return bits;
}
#undef SSE_ALWAYS_INLINE
#elif defined(__SSE4_2__) // IR_COMPILE for SSE 4.2.
/// When cross-compiling to IR, we cannot use inline asm because LLVM JIT does not
/// support it. However, the cross-compiled IR is compiled twice: with and without
/// -msse4.2. When -msse4.2 is enabled in the cross-compile, we can just use the
/// compiler intrinsics.
#include <smmintrin.h>
/// IR + -msse4.2 build: delegate straight to the compiler intrinsic. MODE is a
/// template constant, as _mm_cmpestrm requires an immediate control byte.
template <int MODE>
static inline __m128i SSE4_cmpestrm(__m128i str1, int len1, __m128i str2, int len2) {
  const __m128i match_mask = _mm_cmpestrm(str1, len1, str2, len2, MODE);
  return match_mask;
}
/// IR + -msse4.2 build: delegate straight to the compiler intrinsic. MODE is a
/// template constant, as _mm_cmpestri requires an immediate control byte.
template <int MODE>
static inline int SSE4_cmpestri(__m128i str1, int len1, __m128i str2, int len2) {
  const int index = _mm_cmpestri(str1, len1, str2, len2, MODE);
  return index;
}
/// With -msse4.2 enabled, the compiler intrinsics are usable directly; alias
/// them to the wrapper names used by the rest of the codebase.
#define SSE4_crc32_u8 _mm_crc32_u8
#define SSE4_crc32_u16 _mm_crc32_u16
#define SSE4_crc32_u32 _mm_crc32_u32
#define SSE4_crc32_u64 _mm_crc32_u64
#define POPCNT_popcnt_u64 _mm_popcnt_u64
#else // IR_COMPILE without SSE 4.2.
/// When cross-compiling to IR without SSE 4.2 support (i.e. no -msse4.2), we cannot use
/// SSE 4.2 instructions. Otherwise, the IR loading will fail on CPUs that don't
/// support SSE 4.2. However, because the caller isn't allowed to call these routines
/// on CPUs that lack SSE 4.2 anyway, we can implement stubs for this case.
/// Stub for IR built without SSE 4.2: must never be reached at runtime.
template <int MODE>
static inline __m128i SSE4_cmpestrm(__m128i str1, int len1, __m128i str2, int len2) {
DCHECK(false) << "CPU doesn't support SSE 4.2";
return (__m128i){0}; // NOLINT
}
/// Stub for IR built without SSE 4.2: must never be reached at runtime.
template <int MODE>
static inline int SSE4_cmpestri(__m128i str1, int len1, __m128i str2, int len2) {
DCHECK(false) << "CPU doesn't support SSE 4.2";
return 0;
}
/// Stub for IR built without SSE 4.2: must never be reached at runtime.
static inline uint32_t SSE4_crc32_u8(uint32_t crc, uint8_t v) {
DCHECK(false) << "CPU doesn't support SSE 4.2";
return 0;
}
/// Stub for IR built without SSE 4.2: must never be reached at runtime.
static inline uint32_t SSE4_crc32_u16(uint32_t crc, uint16_t v) {
DCHECK(false) << "CPU doesn't support SSE 4.2";
return 0;
}
/// Stub for IR built without SSE 4.2: must never be reached at runtime.
static inline uint32_t SSE4_crc32_u32(uint32_t crc, uint32_t v) {
DCHECK(false) << "CPU doesn't support SSE 4.2";
return 0;
}
/// Stub for IR built without SSE 4.2: must never be reached at runtime.
static inline uint32_t SSE4_crc32_u64(uint32_t crc, uint64_t v) {
DCHECK(false) << "CPU doesn't support SSE 4.2";
return 0;
}
/// Stub for IR built without SSE 4.2: must never be reached at runtime.
static inline int64_t POPCNT_popcnt_u64(uint64_t a) {
DCHECK(false) << "CPU doesn't support SSE 4.2";
return 0;
}
#endif // IR_COMPILE
#else
/// Stub for builds without PARQUET_USE_SSE: must never be reached at runtime.
static inline uint32_t SSE4_crc32_u8(uint32_t crc, uint8_t v) {
DCHECK(false) << "SSE support is not enabled";
return 0;
}
/// Stub for builds without PARQUET_USE_SSE: must never be reached at runtime.
static inline uint32_t SSE4_crc32_u16(uint32_t crc, uint16_t v) {
DCHECK(false) << "SSE support is not enabled";
return 0;
}
/// Stub for builds without PARQUET_USE_SSE: must never be reached at runtime.
static inline uint32_t SSE4_crc32_u32(uint32_t crc, uint32_t v) {
DCHECK(false) << "SSE support is not enabled";
return 0;
}
/// Stub for builds without PARQUET_USE_SSE: must never be reached at runtime.
static inline uint32_t SSE4_crc32_u64(uint32_t crc, uint64_t v) {
DCHECK(false) << "SSE support is not enabled";
return 0;
}
/// Stub for builds without PARQUET_USE_SSE: must never be reached at runtime.
static inline int64_t POPCNT_popcnt_u64(uint64_t a) {
DCHECK(false) << "SSE support is not enabled";
return 0;
}
#endif // PARQUET_USE_SSE
} // namespace parquet
#endif // PARQUET_UTIL_SSE_UTIL_H