// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#ifndef IMPALA_UTIL_SSE_UTIL_H
#define IMPALA_UTIL_SSE_UTIL_H
#include <emmintrin.h>
#include <cstdint> // for the fixed-width integer types used below
#include "common/logging.h" // for DCHECK, used by the no-SSE4.2 IR stubs
#if defined(IR_COMPILE) && defined(__SSE4_2__) // IR_COMPILE for SSE 4.2.
#include <smmintrin.h>
#endif
namespace impala {
/// This class contains constants useful for text processing with SSE4.2 intrinsics.
namespace SSEUtil {
/// Number of characters that fit in 64/128 bit register. SSE provides instructions
/// for loading 64 or 128 bits into a register at a time.
static const int CHARS_PER_64_BIT_REGISTER = 8;
static const int CHARS_PER_128_BIT_REGISTER = 16;
/// SSE4.2 adds instructions for text processing. The instructions have a control
/// byte that determines some of the functionality of each instruction (equivalent
/// to GCC's _SIDD_CMP_EQUAL_ANY, etc.).
static const int PCMPSTR_EQUAL_ANY = 0x00; // strchr
static const int PCMPSTR_EQUAL_EACH = 0x08; // strcmp
static const int PCMPSTR_UBYTE_OPS = 0x00; // unsigned char (8 bits rather than 16)
static const int PCMPSTR_NEG_POLARITY = 0x10; // see Intel SDM chapter 4.1.4.
/// In this mode, SSE text processing functions will return a mask of all the
/// characters that matched.
static const int STRCHR_MODE = PCMPSTR_EQUAL_ANY | PCMPSTR_UBYTE_OPS;
/// In this mode, SSE text processing functions will return the number of bytes that match
/// consecutively from the beginning.
static const int STRCMP_MODE = PCMPSTR_EQUAL_EACH | PCMPSTR_UBYTE_OPS |
PCMPSTR_NEG_POLARITY;
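/// Illustrative sketch (not part of this header): with STRCMP_MODE, SSE4_cmpestri
/// (defined below) returns the index of the first mismatching byte position, i.e.
/// the length of the common prefix of two chunks. 's1', 's2' and the lengths here
/// are hypothetical:
///
///   __m128i a = _mm_loadu_si128(reinterpret_cast<const __m128i*>(s1));
///   __m128i b = _mm_loadu_si128(reinterpret_cast<const __m128i*>(s2));
///   int common_prefix = SSE4_cmpestri<SSEUtil::STRCMP_MODE>(a, len1, b, len2);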
/// Precomputed mask values up to 16 bits.
static const int SSE_BITMASK[CHARS_PER_128_BIT_REGISTER] = {
1 << 0,
1 << 1,
1 << 2,
1 << 3,
1 << 4,
1 << 5,
1 << 6,
1 << 7,
1 << 8,
1 << 9,
1 << 10,
1 << 11,
1 << 12,
1 << 13,
1 << 14,
1 << 15,
};
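/// Illustrative sketch (not part of this header): SSE_BITMASK can be used to test a
/// single position in the bit mask that SSE4_cmpestrm (defined below) produces in
/// STRCHR_MODE. 'needles', 'num_needles', 'haystack' and 'i' are hypothetical:
///
///   __m128i mask = SSE4_cmpestrm<SSEUtil::STRCHR_MODE>(
///       needles, num_needles, haystack, CHARS_PER_128_BIT_REGISTER);
///   int match_bits = _mm_extract_epi16(mask, 0); // one bit per haystack byte
///   bool matched_at_i = (match_bits & SSE_BITMASK[i]) != 0;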
}
/// Define the SSE 4.2 intrinsics. The caller must first verify at runtime (or codegen
/// IR load time) that the processor supports SSE 4.2 before calling these. All __asm__
/// blocks are marked __volatile__ to prevent hoisting the ASM out of checks for CPU
/// support (e.g. IMPALA-6882).
///
/// These intrinsics are defined outside the namespace because the IR w/ SSE 4.2 case
/// needs to use macros.
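/// For example (sketch only), native callers typically guard calls with Impala's
/// CpuInfo check (util/cpu-info.h); 'hash', 'value' and the fallback are
/// hypothetical:
///
///   if (CpuInfo::IsSupported(CpuInfo::SSE4_2)) {
///     hash = SSE4_crc32_u32(hash, value);
///   } else {
///     hash = FallbackHash(hash, value); // hypothetical non-SSE path
///   }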
#ifndef IR_COMPILE
/// When compiling to native code (i.e. not IR), we cannot use the -msse4.2 compiler
/// flag. Otherwise, the compiler will emit SSE 4.2 instructions outside of the runtime
/// SSE 4.2 checks and Impala will crash on CPUs that don't support SSE 4.2
/// (IMPALA-1399/1646). The compiler intrinsics cannot be used without -msse4.2, so we
/// define our own implementations of the intrinsics instead.
#if defined(__SSE4_1__) || defined(__POPCNT__)
/// Impala native code should not be compiled with -msse4.1 or higher until the minimum
/// CPU requirement is raised to at least the targeted instruction set.
#error "Do not compile with -msse4.1 or higher."
#endif
/// The PCMPxSTRy instructions require that the control byte 'mode' be encoded as an
/// immediate, so these functions must always be inlined so that the mode constant
/// propagates into the inline asm.
#define SSE_ALWAYS_INLINE inline __attribute__ ((__always_inline__))
template<int MODE>
static SSE_ALWAYS_INLINE __m128i SSE4_cmpestrm(
    __m128i str1, int len1, __m128i str2, int len2) {
#ifdef __clang__
/// Use an asm register rather than the Yz output constraint to work around LLVM
/// bug 13199 - clang doesn't support Y-prefixed asm constraints.
register volatile __m128i result asm ("xmm0");
__asm__ __volatile__ ("pcmpestrm %5, %2, %1"
: "=x"(result) : "x"(str1), "xm"(str2), "a"(len1), "d"(len2), "i"(MODE) : "cc");
#else
__m128i result;
__asm__ __volatile__ ("pcmpestrm %5, %2, %1"
: "=Yz"(result) : "x"(str1), "xm"(str2), "a"(len1), "d"(len2), "i"(MODE) : "cc");
#endif
return result;
}
template<int MODE>
static SSE_ALWAYS_INLINE int SSE4_cmpestri(
    __m128i str1, int len1, __m128i str2, int len2) {
int result;
__asm__ __volatile__("pcmpestri %5, %2, %1"
: "=c"(result) : "x"(str1), "xm"(str2), "a"(len1), "d"(len2), "i"(MODE) : "cc");
return result;
}
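/// The mode must be a compile-time constant because pcmpestri/pcmpestrm encode the
/// control byte as an immediate (hence the "i" constraint above); passing it as an
/// ordinary runtime argument would fail to assemble. For example (sketch; 'needles',
/// 'num_needles' and 'chunk' are hypothetical):
///
///   int idx = SSE4_cmpestri<SSEUtil::STRCHR_MODE>(needles, num_needles, chunk, 16);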
static inline uint32_t SSE4_crc32_u8(uint32_t crc, uint8_t v) {
__asm__ __volatile__("crc32b %1, %0" : "+r"(crc) : "rm"(v));
return crc;
}
static inline uint32_t SSE4_crc32_u16(uint32_t crc, uint16_t v) {
__asm__ __volatile__("crc32w %1, %0" : "+r"(crc) : "rm"(v));
return crc;
}
static inline uint32_t SSE4_crc32_u32(uint32_t crc, uint32_t v) {
__asm__ __volatile__("crc32l %1, %0" : "+r"(crc) : "rm"(v));
return crc;
}
static inline uint32_t SSE4_crc32_u64(uint32_t crc, uint64_t v) {
uint64_t result = crc;
__asm__ __volatile__("crc32q %1, %0" : "+r"(result) : "rm"(v));
return result;
}
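/// Illustrative sketch (not part of this header): the crc32 instructions chain, so
/// a buffer can be hashed 8 bytes at a time with a bytewise tail. 'buf', 'len' and
/// the seed are hypothetical:
///
///   uint32_t crc = 0xFFFFFFFF;
///   size_t i = 0;
///   for (; i + 8 <= len; i += 8) {
///     uint64_t chunk;
///     memcpy(&chunk, buf + i, 8);
///     crc = SSE4_crc32_u64(crc, chunk);
///   }
///   for (; i < len; ++i) crc = SSE4_crc32_u8(crc, buf[i]);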
static inline int64_t POPCNT_popcnt_u64(uint64_t a) {
int64_t result;
__asm__ __volatile__("popcntq %1, %0" : "=r"(result) : "mr"(a) : "cc");
return result;
}
#undef SSE_ALWAYS_INLINE
#elif defined(__SSE4_2__) // IR_COMPILE for SSE 4.2.
/// When cross-compiling to IR, we cannot use inline asm because LLVM JIT does not
/// support it. However, the cross-compiled IR is compiled twice: with and without
/// -msse4.2. When -msse4.2 is enabled in the cross-compile, we can just use the
/// compiler intrinsics.
template<int MODE>
static inline __m128i SSE4_cmpestrm(
__m128i str1, int len1, __m128i str2, int len2) {
return _mm_cmpestrm(str1, len1, str2, len2, MODE);
}
template<int MODE>
static inline int SSE4_cmpestri(
__m128i str1, int len1, __m128i str2, int len2) {
return _mm_cmpestri(str1, len1, str2, len2, MODE);
}
#define SSE4_crc32_u8 _mm_crc32_u8
#define SSE4_crc32_u16 _mm_crc32_u16
#define SSE4_crc32_u32 _mm_crc32_u32
#define SSE4_crc32_u64 _mm_crc32_u64
#define POPCNT_popcnt_u64 _mm_popcnt_u64
#else // IR_COMPILE without SSE 4.2.
/// When cross-compiling to IR without SSE 4.2 support (i.e. no -msse4.2), we cannot
/// emit SSE 4.2 instructions; otherwise, loading the IR will fail on CPUs that don't
/// support SSE 4.2. Since callers must not invoke these routines on CPUs that lack
/// SSE 4.2 anyway, stubs suffice for this case.
template<int MODE>
static inline __m128i SSE4_cmpestrm(__m128i str1, int len1, __m128i str2, int len2) {
DCHECK(false) << "CPU doesn't support SSE 4.2";
return (__m128i) { 0 };
}
template<int MODE>
static inline int SSE4_cmpestri(__m128i str1, int len1, __m128i str2, int len2) {
DCHECK(false) << "CPU doesn't support SSE 4.2";
return 0;
}
static inline uint32_t SSE4_crc32_u8(uint32_t crc, uint8_t v) {
DCHECK(false) << "CPU doesn't support SSE 4.2";
return 0;
}
static inline uint32_t SSE4_crc32_u16(uint32_t crc, uint16_t v) {
DCHECK(false) << "CPU doesn't support SSE 4.2";
return 0;
}
static inline uint32_t SSE4_crc32_u32(uint32_t crc, uint32_t v) {
DCHECK(false) << "CPU doesn't support SSE 4.2";
return 0;
}
static inline uint32_t SSE4_crc32_u64(uint32_t crc, uint64_t v) {
DCHECK(false) << "CPU doesn't support SSE 4.2";
return 0;
}
static inline int64_t POPCNT_popcnt_u64(uint64_t a) {
DCHECK(false) << "CPU doesn't support SSE 4.2";
return 0;
}
#endif
} // namespace impala
#endif // IMPALA_UTIL_SSE_UTIL_H