// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include "util/bit-util.h"
#include <immintrin.h>
namespace {
// ByteSwapScalarLoop is only used in bit-util.cc, so it is placed in this
// anonymous namespace.
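// Reverses the byte order of 'src' into 'dst';
// e.g. {0x01, 0x02, 0x03} becomes {0x03, 0x02, 0x01}.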
inline void ByteSwapScalarLoop(const void* src, int len, void* dst) {
  // TODO: improve the performance of the following code further using the
  // BSWAP intrinsic.
uint8_t* d = reinterpret_cast<uint8_t*>(dst);
const uint8_t* s = reinterpret_cast<const uint8_t*>(src);
for (int i = 0; i < len; ++i) d[i] = s[len - i - 1];
}
} // anonymous namespace
namespace impala {
void SimdByteSwap::ByteSwapScalar(const void* source, int len, void* dest) {
uint8_t* dst = reinterpret_cast<uint8_t*>(dest);
const uint8_t* src = reinterpret_cast<const uint8_t*>(source);
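  // Each case reverses 'len' bytes using the widest ByteSwap loads that fit;
  // e.g. len == 7 is done as a 4-byte swap, a 2-byte swap, and a 1-byte copy.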
switch (len) {
case 1:
      *dst = *src;
return;
case 2:
*reinterpret_cast<uint16_t*>(dst) =
BitUtil::ByteSwap(*reinterpret_cast<const uint16_t*>(src));
return;
case 3:
*reinterpret_cast<uint16_t*>(dst + 1) =
BitUtil::ByteSwap(*reinterpret_cast<const uint16_t*>(src));
      *dst = *(src + 2);
return;
case 4:
*reinterpret_cast<uint32_t*>(dst) =
BitUtil::ByteSwap(*reinterpret_cast<const uint32_t*>(src));
return;
case 5:
*reinterpret_cast<uint32_t*>(dst + 1) =
BitUtil::ByteSwap(*reinterpret_cast<const uint32_t*>(src));
      *dst = *(src + 4);
return;
case 6:
*reinterpret_cast<uint32_t*>(dst + 2) =
BitUtil::ByteSwap(*reinterpret_cast<const uint32_t*>(src));
*reinterpret_cast<uint16_t*>(dst) =
BitUtil::ByteSwap(*reinterpret_cast<const uint16_t*>(src + 4));
return;
case 7:
*reinterpret_cast<uint32_t*>(dst + 3) =
BitUtil::ByteSwap(*reinterpret_cast<const uint32_t*>(src));
*reinterpret_cast<uint16_t*>(dst + 1) =
BitUtil::ByteSwap(*reinterpret_cast<const uint16_t*>(src + 4));
      *dst = *(src + 6);
return;
case 8:
*reinterpret_cast<uint64_t*>(dst) =
BitUtil::ByteSwap(*reinterpret_cast<const uint64_t*>(src));
return;
case 9:
*reinterpret_cast<uint64_t*>(dst + 1) =
BitUtil::ByteSwap(*reinterpret_cast<const uint64_t*>(src));
      *dst = *(src + 8);
return;
case 10:
*reinterpret_cast<uint64_t*>(dst + 2) =
BitUtil::ByteSwap(*reinterpret_cast<const uint64_t*>(src));
*reinterpret_cast<uint16_t*>(dst) =
BitUtil::ByteSwap(*reinterpret_cast<const uint16_t*>(src + 8));
return;
case 11:
*reinterpret_cast<uint64_t*>(dst + 3) =
BitUtil::ByteSwap(*reinterpret_cast<const uint64_t*>(src));
*reinterpret_cast<uint16_t*>(dst + 1) =
BitUtil::ByteSwap(*reinterpret_cast<const uint16_t*>(src + 8));
      *dst = *(src + 10);
return;
case 12:
*reinterpret_cast<uint64_t*>(dst + 4) =
BitUtil::ByteSwap(*reinterpret_cast<const uint64_t*>(src));
*reinterpret_cast<uint32_t*>(dst) =
BitUtil::ByteSwap(*reinterpret_cast<const uint32_t*>(src + 8));
return;
case 13:
*reinterpret_cast<uint64_t*>(dst + 5) =
BitUtil::ByteSwap(*reinterpret_cast<const uint64_t*>(src));
*reinterpret_cast<uint32_t*>(dst + 1) =
BitUtil::ByteSwap(*reinterpret_cast<const uint32_t*>(src + 8));
      *dst = *(src + 12);
return;
case 14:
*reinterpret_cast<uint64_t*>(dst + 6) =
BitUtil::ByteSwap(*reinterpret_cast<const uint64_t*>(src));
*reinterpret_cast<uint32_t*>(dst + 2) =
BitUtil::ByteSwap(*reinterpret_cast<const uint32_t*>(src + 8));
*reinterpret_cast<uint16_t*>(dst) =
BitUtil::ByteSwap(*reinterpret_cast<const uint16_t*>(src + 12));
return;
case 15:
*reinterpret_cast<uint64_t*>(dst + 7) =
BitUtil::ByteSwap(*reinterpret_cast<const uint64_t*>(src));
*reinterpret_cast<uint32_t*>(dst + 3) =
BitUtil::ByteSwap(*reinterpret_cast<const uint32_t*>(src + 8));
*reinterpret_cast<uint16_t*>(dst + 1) =
BitUtil::ByteSwap(*reinterpret_cast<const uint16_t*>(src + 12));
      *dst = *(src + 14);
return;
case 16:
*reinterpret_cast<uint64_t*>(dst + 8) =
BitUtil::ByteSwap(*reinterpret_cast<const uint64_t*>(src));
*reinterpret_cast<uint64_t*>(dst) =
BitUtil::ByteSwap(*reinterpret_cast<const uint64_t*>(src + 8));
return;
default:
      // Fall back to the slow loop-based swap.
ByteSwapScalarLoop(source, len, dest);
return;
}
}
// This shuffle mask follows from the definition of _mm_set_epi8, which takes
// its arguments from the most significant byte to the least significant.
// See the Intel Intrinsics Guide for details:
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/
const __m128i mask128i = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
13, 14, 15);
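// With this mask, _mm_shuffle_epi8 moves input byte (15 - i) to output byte i,
// so a single instruction reverses all 16 bytes of the vector.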
// ByteSwap 16 bytes using SSSE3 instructions.
__attribute__((target("ssse3")))
inline void SimdByteSwap::ByteSwap128(const uint8_t* src, uint8_t* dst) {
_mm_storeu_si128(reinterpret_cast<__m128i*>(dst), _mm_shuffle_epi8(
_mm_loadu_si128(reinterpret_cast<const __m128i*>(src)), mask128i));
}
// ByteSwap 32 bytes using AVX2 instructions.
__attribute__((target("avx2")))
inline void SimdByteSwap::ByteSwap256(const uint8_t* src, uint8_t* dst) {
  // This shuffle mask follows from the definition of _mm256_set_epi8, which
  // takes its arguments from the most significant byte to the least
  // significant. See the Intel Intrinsics Guide for details:
  // https://software.intel.com/sites/landingpage/IntrinsicsGuide/
const __m256i mask256i = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
_mm256_storeu_si256(reinterpret_cast<__m256i*>(dst), _mm256_shuffle_epi8(
_mm256_loadu_si256(reinterpret_cast<const __m256i*>(src)), mask256i));
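  // _mm256_shuffle_epi8 shuffles within each 128-bit lane independently, so at
  // this point each 16-byte half of 'dst' is reversed in place, but the two
  // halves are still in their original order; swap them to complete the
  // 32-byte reversal.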
  const __m128i part1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(dst));
  const __m128i part2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(dst + 16));
_mm_storeu_si128(reinterpret_cast<__m128i*>(dst), part2);
_mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), part1);
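  // Clear the upper halves of the YMM registers to avoid AVX-to-SSE
  // transition penalties in subsequent SSE code.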
_mm256_zeroupper();
}
// Internal implementation of ByteSwapSimd.
// TEMPLATE_DATA_WIDTH: 16 or 32 (bytes), selecting the SSSE3 routine
// (ByteSwap128) or the AVX2 routine (ByteSwap256) respectively.
// source: the memory address of the source
// len: the number of bytes of input data
// dest: the memory address of the destination
template <int TEMPLATE_DATA_WIDTH>
inline void SimdByteSwap::ByteSwapSimd(const void* source, const int len, void* dest) {
  DCHECK(TEMPLATE_DATA_WIDTH == 16 || TEMPLATE_DATA_WIDTH == 32)
      << "Only 16 and 32 are valid values for TEMPLATE_DATA_WIDTH.";
  // Function pointer to the appropriate SIMD ByteSwap routine.
  void (*bswap_fptr)(const uint8_t* src, uint8_t* dst) = nullptr;
if (TEMPLATE_DATA_WIDTH == 16) {
bswap_fptr = SimdByteSwap::ByteSwap128;
} else if (TEMPLATE_DATA_WIDTH == 32) {
bswap_fptr = SimdByteSwap::ByteSwap256;
}
const uint8_t* src = reinterpret_cast<const uint8_t*>(source);
uint8_t* dst = reinterpret_cast<uint8_t*>(dest);
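  // Reversal maps the last TEMPLATE_DATA_WIDTH bytes of 'src' onto the first
  // TEMPLATE_DATA_WIDTH bytes of 'dst', so walk 'src' from the tail and 'dst'
  // from the head. For example, with len == 40 and a 16-byte width:
  // src[24..39] -> dst[0..15], src[8..23] -> dst[16..31], leaving src[0..7]
  // for the scalar tail below.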
src += len - TEMPLATE_DATA_WIDTH;
int i = len - TEMPLATE_DATA_WIDTH;
while (true) {
bswap_fptr(src, dst);
dst += TEMPLATE_DATA_WIDTH;
if (i < TEMPLATE_DATA_WIDTH) break;
i -= TEMPLATE_DATA_WIDTH;
src -= TEMPLATE_DATA_WIDTH;
}
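  // On the 32-byte (AVX2) path, one extra SSSE3 pass can shrink the remainder
  // below 16 bytes before falling back to the scalar routine.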
if (TEMPLATE_DATA_WIDTH > 16 && i >= 16) {
src -= 16;
SimdByteSwap::ByteSwap128(src, dst);
i -= 16;
dst += 16;
}
  // The remaining bytes (< 16) are handled by the scalar routine.
  // TODO: improve the performance of the following code further using the
  // pshufb intrinsic.
src -= i;
SimdByteSwap::ByteSwapScalar(src, i, dst);
}
// Explicit instantiations of ByteSwapSimd for the SSSE3 (16-byte) and AVX2
// (32-byte) code paths.
template void SimdByteSwap::ByteSwapSimd<16>(
    const void* source, const int len, void* dest);
template void SimdByteSwap::ByteSwapSimd<32>(
    const void* source, const int len, void* dest);
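// Reverses the byte order of an arbitrary-length buffer, selecting the fastest
// routine the current CPU supports. Illustrative usage:
//   uint8_t in[3] = {0x01, 0x02, 0x03};
//   uint8_t out[3];
//   BitUtil::ByteSwap(out, in, 3); // out == {0x03, 0x02, 0x01}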
void BitUtil::ByteSwap(void* dest, const void* source, int len) {
  // Select a code path according to the CPU's SIMD capabilities and the input length.
if (LIKELY(len < 16)) {
SimdByteSwap::ByteSwapScalar(source, len, dest);
} else if (len >= 32) {
    // The AVX2 path requires at least 32 bytes of input.
if (CpuInfo::IsSupported(CpuInfo::AVX2)) {
SimdByteSwap::ByteSwapSimd<32>(source, len, dest);
} else if (LIKELY(CpuInfo::IsSupported(CpuInfo::SSSE3))) {
      // SSSE3 is more widely supported than AVX2, so fall back to it.
SimdByteSwap::ByteSwapSimd<16>(source, len, dest);
} else {
SimdByteSwap::ByteSwapScalar(source, len, dest);
}
} else {
    // The SSSE3 path requires at least 16 bytes of input; here 16 <= len < 32.
if (LIKELY(CpuInfo::IsSupported(CpuInfo::SSSE3))) {
SimdByteSwap::ByteSwapSimd<16>(source, len, dest);
} else {
SimdByteSwap::ByteSwapScalar(source, len, dest);
}
}
}
} // namespace impala