blob: 22f6972fb664b54cacad735cf821fb7b93f2f994 [file] [log] [blame]
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "BpackingAvx512.hh"
#include "BitUnpackerAvx512.hh"
#include "CpuInfoUtil.hh"
#include "RLEv2.hh"
namespace orc {
UnpackAvx512::UnpackAvx512(RleDecoderV2* dec) : decoder(dec), unpackDefault(UnpackDefault(dec)) {
// PASS
}
UnpackAvx512::~UnpackAvx512() {
// PASS
}
template <bool hasBitOffset>
inline void UnpackAvx512::alignHeaderBoundary(const uint32_t bitWidth, const uint32_t bitMaxSize,
uint64_t& startBit, uint64_t& bufMoveByteLen,
uint64_t& bufRestByteLen,
uint64_t& remainingNumElements,
uint64_t& tailBitLen, uint32_t& backupByteLen,
uint64_t& numElements, bool& resetBuf,
const uint8_t*& srcPtr, int64_t*& dstPtr) {
uint64_t numBits = remainingNumElements * bitWidth;
if (hasBitOffset && startBit != 0) {
numBits += startBit - ORC_VECTOR_BYTE_WIDTH;
}
bufMoveByteLen += moveByteLen(numBits);
if (bufMoveByteLen <= bufRestByteLen) {
numElements = remainingNumElements;
resetBuf = false;
remainingNumElements = 0;
} else {
uint64_t leadingBits = 0;
if (hasBitOffset && startBit != 0) leadingBits = ORC_VECTOR_BYTE_WIDTH - startBit;
uint64_t bufRestBitLen = bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + leadingBits;
numElements = bufRestBitLen / bitWidth;
remainingNumElements -= numElements;
tailBitLen = fmod(bufRestBitLen, bitWidth);
resetBuf = true;
}
if (tailBitLen != 0) {
backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH;
tailBitLen = 0;
}
if (hasBitOffset && startBit > 0) {
uint32_t align = getAlign(startBit, bitWidth, bitMaxSize);
if (align > numElements) {
align = numElements;
}
if (align != 0) {
bufMoveByteLen -= moveByteLen(align * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH);
plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit);
srcPtr = reinterpret_cast<const uint8_t*>(decoder->getBufStart());
bufRestByteLen = decoder->bufLength();
dstPtr += align;
numElements -= align;
}
}
}
template <bool hasBitOffset>
inline void UnpackAvx512::alignTailerBoundary(const uint32_t bitWidth, const uint32_t specialBit,
uint64_t& startBit, uint64_t& bufMoveByteLen,
uint64_t& bufRestByteLen,
uint64_t& remainingNumElements,
uint32_t& backupByteLen, uint64_t& numElements,
bool& resetBuf, const uint8_t*& srcPtr,
int64_t*& dstPtr) {
if (numElements > 0) {
uint64_t numBits = numElements * bitWidth;
if (hasBitOffset && startBit != 0) {
numBits += startBit - ORC_VECTOR_BYTE_WIDTH;
}
bufMoveByteLen -= moveByteLen(numBits);
if (hasBitOffset) {
plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit);
} else {
switch (specialBit) {
case 16:
unpackDefault.unrolledUnpack16(dstPtr, 0, numElements);
break;
case 24:
unpackDefault.unrolledUnpack24(dstPtr, 0, numElements);
break;
case 32:
unpackDefault.unrolledUnpack32(dstPtr, 0, numElements);
break;
default:
break;
}
}
srcPtr = reinterpret_cast<const uint8_t*>(decoder->getBufStart());
dstPtr += numElements;
bufRestByteLen = decoder->bufLength();
}
if (bufMoveByteLen <= bufRestByteLen) {
decoder->resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen);
return;
}
decoder->resetBufferStart(bufRestByteLen, resetBuf, backupByteLen);
if (backupByteLen != 0) {
if (hasBitOffset) {
plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit);
} else {
switch (specialBit) {
case 16:
unpackDefault.unrolledUnpack16(dstPtr, 0, 1);
break;
case 24:
unpackDefault.unrolledUnpack24(dstPtr, 0, 1);
break;
case 32:
unpackDefault.unrolledUnpack32(dstPtr, 0, 1);
break;
default:
break;
}
}
dstPtr++;
backupByteLen = 0;
remainingNumElements--;
}
bufRestByteLen = decoder->bufLength();
bufMoveByteLen = 0;
srcPtr = reinterpret_cast<const uint8_t*>(decoder->getBufStart());
}
void UnpackAvx512::vectorUnpack1(int64_t* data, uint64_t offset, uint64_t len) {
uint32_t bitWidth = 1;
const uint8_t* srcPtr = reinterpret_cast<const uint8_t*>(decoder->getBufStart());
uint64_t numElements = 0;
int64_t* dstPtr = data + offset;
uint64_t bufMoveByteLen = 0;
uint64_t bufRestByteLen = decoder->bufLength();
bool resetBuf = false;
uint64_t startBit = 0;
uint64_t tailBitLen = 0;
uint32_t backupByteLen = 0;
while (len > 0) {
alignHeaderBoundary<true>(bitWidth, UNPACK_8Bit_MAX_SIZE, startBit, bufMoveByteLen,
bufRestByteLen, len, tailBitLen, backupByteLen, numElements,
resetBuf, srcPtr, dstPtr);
if (numElements >= VECTOR_UNPACK_8BIT_MAX_NUM) {
uint8_t* simdPtr = reinterpret_cast<uint8_t*>(vectorBuf);
__m512i reverseMask1u = _mm512_loadu_si512(reverseMaskTable1u);
while (numElements >= VECTOR_UNPACK_8BIT_MAX_NUM) {
uint64_t src_64 = *reinterpret_cast<uint64_t*>(const_cast<uint8_t*>(srcPtr));
// convert mask to 512-bit register. 0 --> 0x00, 1 --> 0xFF
__m512i srcmm = _mm512_movm_epi8(src_64);
// make 0x00 --> 0x00, 0xFF --> 0x01
srcmm = _mm512_abs_epi8(srcmm);
srcmm = _mm512_shuffle_epi8(srcmm, reverseMask1u);
_mm512_storeu_si512(simdPtr, srcmm);
srcPtr += 8 * bitWidth;
decoder->resetBufferStart(8 * bitWidth, false, 0);
bufRestByteLen = decoder->bufLength();
bufMoveByteLen -= 8 * bitWidth;
numElements -= VECTOR_UNPACK_8BIT_MAX_NUM;
std::copy(simdPtr, simdPtr + VECTOR_UNPACK_8BIT_MAX_NUM, dstPtr);
dstPtr += VECTOR_UNPACK_8BIT_MAX_NUM;
}
}
alignTailerBoundary<true>(bitWidth, 0, startBit, bufMoveByteLen, bufRestByteLen, len,
backupByteLen, numElements, resetBuf, srcPtr, dstPtr);
}
}
void UnpackAvx512::vectorUnpack2(int64_t* data, uint64_t offset, uint64_t len) {
uint32_t bitWidth = 2;
const uint8_t* srcPtr = reinterpret_cast<const uint8_t*>(decoder->getBufStart());
uint64_t numElements = 0;
int64_t* dstPtr = data + offset;
uint64_t bufMoveByteLen = 0;
uint64_t bufRestByteLen = decoder->bufLength();
bool resetBuf = false;
uint64_t startBit = 0;
uint64_t tailBitLen = 0;
uint32_t backupByteLen = 0;
while (len > 0) {
alignHeaderBoundary<true>(bitWidth, UNPACK_8Bit_MAX_SIZE, startBit, bufMoveByteLen,
bufRestByteLen, len, tailBitLen, backupByteLen, numElements,
resetBuf, srcPtr, dstPtr);
if (numElements >= VECTOR_UNPACK_8BIT_MAX_NUM) {
uint8_t* simdPtr = reinterpret_cast<uint8_t*>(vectorBuf);
__mmask64 readMask = ORC_VECTOR_MAX_16U; // first 16 bytes (64 elements)
__m512i parse_mask = _mm512_set1_epi16(0x0303); // 2 times 1 then (8 - 2) times 0
while (numElements >= VECTOR_UNPACK_8BIT_MAX_NUM) {
__m512i srcmm3 = _mm512_maskz_loadu_epi8(readMask, srcPtr);
__m512i srcmm0, srcmm1, srcmm2, tmpmm;
srcmm2 = _mm512_srli_epi16(srcmm3, 2);
srcmm1 = _mm512_srli_epi16(srcmm3, 4);
srcmm0 = _mm512_srli_epi16(srcmm3, 6);
// turn 2 bitWidth into 8 by zeroing 3 of each 4 elements.
// move them into their places
// srcmm0: a e i m 0 0 0 0 0 0 0 0 0 0 0 0
// srcmm1: b f j n 0 0 0 0 0 0 0 0 0 0 0 0
tmpmm = _mm512_unpacklo_epi8(srcmm0, srcmm1); // ab ef 00 00 00 00 00 00
srcmm0 = _mm512_unpackhi_epi8(srcmm0, srcmm1); // ij mn 00 00 00 00 00 00
srcmm0 = _mm512_shuffle_i64x2(tmpmm, srcmm0, 0x00); // ab ef ab ef ij mn ij mn
// srcmm2: c g k o 0 0 0 0 0 0 0 0 0 0 0 0
// srcmm3: d h l p 0 0 0 0 0 0 0 0 0 0 0 0
tmpmm = _mm512_unpacklo_epi8(srcmm2, srcmm3); // cd gh 00 00 00 00 00 00
srcmm1 = _mm512_unpackhi_epi8(srcmm2, srcmm3); // kl op 00 00 00 00 00 00
srcmm1 = _mm512_shuffle_i64x2(tmpmm, srcmm1, 0x00); // cd gh cd gh kl op kl op
tmpmm = _mm512_unpacklo_epi16(srcmm0, srcmm1); // abcd abcd ijkl ijkl
srcmm0 = _mm512_unpackhi_epi16(srcmm0, srcmm1); // efgh efgh mnop mnop
srcmm0 = _mm512_shuffle_i64x2(tmpmm, srcmm0, 0x88); // abcd ijkl efgh mnop
srcmm0 = _mm512_shuffle_i64x2(srcmm0, srcmm0, 0xD8); // abcd efgh ijkl mnop
srcmm0 = _mm512_and_si512(srcmm0, parse_mask);
_mm512_storeu_si512(simdPtr, srcmm0);
srcPtr += 8 * bitWidth;
decoder->resetBufferStart(8 * bitWidth, false, 0);
bufRestByteLen = decoder->bufLength();
bufMoveByteLen -= 8 * bitWidth;
numElements -= VECTOR_UNPACK_8BIT_MAX_NUM;
std::copy(simdPtr, simdPtr + VECTOR_UNPACK_8BIT_MAX_NUM, dstPtr);
dstPtr += VECTOR_UNPACK_8BIT_MAX_NUM;
}
}
alignTailerBoundary<true>(bitWidth, 0, startBit, bufMoveByteLen, bufRestByteLen, len,
backupByteLen, numElements, resetBuf, srcPtr, dstPtr);
}
}
void UnpackAvx512::vectorUnpack3(int64_t* data, uint64_t offset, uint64_t len) {
uint32_t bitWidth = 3;
const uint8_t* srcPtr = reinterpret_cast<const uint8_t*>(decoder->getBufStart());
uint64_t numElements = 0;
int64_t* dstPtr = data + offset;
uint64_t bufMoveByteLen = 0;
uint64_t bufRestByteLen = decoder->bufLength();
bool resetBuf = false;
uint64_t startBit = 0;
uint64_t tailBitLen = 0;
uint32_t backupByteLen = 0;
while (len > 0) {
alignHeaderBoundary<true>(bitWidth, UNPACK_8Bit_MAX_SIZE, startBit, bufMoveByteLen,
bufRestByteLen, len, tailBitLen, backupByteLen, numElements,
resetBuf, srcPtr, dstPtr);
if (numElements >= VECTOR_UNPACK_8BIT_MAX_NUM) {
uint8_t* simdPtr = reinterpret_cast<uint8_t*>(vectorBuf);
__mmask64 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_BYTE(bitWidth * 64));
__m512i parseMask = _mm512_set1_epi8(ORC_VECTOR_BIT_MASK(bitWidth));
__m512i permutexIdx = _mm512_loadu_si512(permutexIdxTable3u);
__m512i shuffleIdxPtr[2];
shuffleIdxPtr[0] = _mm512_loadu_si512(shuffleIdxTable3u_0);
shuffleIdxPtr[1] = _mm512_loadu_si512(shuffleIdxTable3u_1);
__m512i shiftMaskPtr[2];
shiftMaskPtr[0] = _mm512_loadu_si512(shiftTable3u_0);
shiftMaskPtr[1] = _mm512_loadu_si512(shiftTable3u_1);
while (numElements >= VECTOR_UNPACK_8BIT_MAX_NUM) {
__m512i srcmm, zmm[2];
srcmm = _mm512_maskz_loadu_epi8(readMask, srcPtr);
srcmm = _mm512_permutexvar_epi16(permutexIdx, srcmm);
// shuffling so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones
zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[0]);
zmm[1] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[1]);
// shifting elements so they start from the start of the word
zmm[0] = _mm512_srlv_epi16(zmm[0], shiftMaskPtr[0]);
zmm[1] = _mm512_sllv_epi16(zmm[1], shiftMaskPtr[1]);
// gathering even and odd elements together
zmm[0] = _mm512_mask_mov_epi8(zmm[0], 0xAAAAAAAAAAAAAAAA, zmm[1]);
zmm[0] = _mm512_and_si512(zmm[0], parseMask);
_mm512_storeu_si512(simdPtr, zmm[0]);
srcPtr += 8 * bitWidth;
decoder->resetBufferStart(8 * bitWidth, false, 0);
bufRestByteLen = decoder->bufLength();
bufMoveByteLen -= 8 * bitWidth;
numElements -= VECTOR_UNPACK_8BIT_MAX_NUM;
std::copy(simdPtr, simdPtr + VECTOR_UNPACK_8BIT_MAX_NUM, dstPtr);
dstPtr += VECTOR_UNPACK_8BIT_MAX_NUM;
}
}
alignTailerBoundary<true>(bitWidth, 0, startBit, bufMoveByteLen, bufRestByteLen, len,
backupByteLen, numElements, resetBuf, srcPtr, dstPtr);
}
}
void UnpackAvx512::vectorUnpack4(int64_t* data, uint64_t offset, uint64_t len) {
uint32_t bitWidth = 4;
const uint8_t* srcPtr = reinterpret_cast<const uint8_t*>(decoder->getBufStart());
uint64_t numElements = 0;
int64_t* dstPtr = data + offset;
uint64_t bufMoveByteLen = 0;
uint64_t bufRestByteLen = decoder->bufLength();
bool resetBuf = false;
uint64_t startBit = 0;
uint64_t tailBitLen = 0;
uint32_t backupByteLen = 0;
while (len > 0) {
alignHeaderBoundary<true>(bitWidth, UNPACK_8Bit_MAX_SIZE, startBit, bufMoveByteLen,
bufRestByteLen, len, tailBitLen, backupByteLen, numElements,
resetBuf, srcPtr, dstPtr);
if (numElements >= VECTOR_UNPACK_8BIT_MAX_NUM) {
uint8_t* simdPtr = reinterpret_cast<uint8_t*>(vectorBuf);
__mmask64 readMask = ORC_VECTOR_MAX_32U; // first 32 bytes (64 elements)
__m512i parseMask = _mm512_set1_epi16(0x0F0F); // 4 times 1 then (8 - 4) times 0
while (numElements >= VECTOR_UNPACK_8BIT_MAX_NUM) {
__m512i srcmm0, srcmm1, tmpmm;
srcmm1 = _mm512_maskz_loadu_epi8(readMask, srcPtr);
srcmm0 = _mm512_srli_epi16(srcmm1, 4);
// move elements into their places
// srcmm0: a c e g 0 0 0 0
// srcmm1: b d f h 0 0 0 0
tmpmm = _mm512_unpacklo_epi8(srcmm0, srcmm1); // ab ef 00 00
srcmm0 = _mm512_unpackhi_epi8(srcmm0, srcmm1); // cd gh 00 00
srcmm0 = _mm512_shuffle_i64x2(tmpmm, srcmm0, 0x44); // ab ef cd gh
srcmm0 = _mm512_shuffle_i64x2(srcmm0, srcmm0, 0xD8); // ab cd ef gh
// turn 4 bitWidth into 8 by zeroing 4 of each 8 bits.
srcmm0 = _mm512_and_si512(srcmm0, parseMask);
_mm512_storeu_si512(simdPtr, srcmm0);
srcPtr += 8 * bitWidth;
decoder->resetBufferStart(8 * bitWidth, false, 0);
bufRestByteLen = decoder->bufLength();
bufMoveByteLen -= 8 * bitWidth;
numElements -= VECTOR_UNPACK_8BIT_MAX_NUM;
std::copy(simdPtr, simdPtr + VECTOR_UNPACK_8BIT_MAX_NUM, dstPtr);
dstPtr += VECTOR_UNPACK_8BIT_MAX_NUM;
}
}
alignTailerBoundary<true>(bitWidth, 0, startBit, bufMoveByteLen, bufRestByteLen, len,
backupByteLen, numElements, resetBuf, srcPtr, dstPtr);
}
}
void UnpackAvx512::vectorUnpack5(int64_t* data, uint64_t offset, uint64_t len) {
uint32_t bitWidth = 5;
const uint8_t* srcPtr = reinterpret_cast<const uint8_t*>(decoder->getBufStart());
uint64_t numElements = 0;
int64_t* dstPtr = data + offset;
uint64_t bufMoveByteLen = 0;
uint64_t bufRestByteLen = decoder->bufLength();
bool resetBuf = false;
uint64_t startBit = 0;
uint64_t tailBitLen = 0;
uint32_t backupByteLen = 0;
while (len > 0) {
alignHeaderBoundary<true>(bitWidth, UNPACK_8Bit_MAX_SIZE, startBit, bufMoveByteLen,
bufRestByteLen, len, tailBitLen, backupByteLen, numElements,
resetBuf, srcPtr, dstPtr);
if (numElements >= VECTOR_UNPACK_8BIT_MAX_NUM) {
uint8_t* simdPtr = reinterpret_cast<uint8_t*>(vectorBuf);
__mmask64 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_BYTE(bitWidth * 64));
__m512i parseMask = _mm512_set1_epi8(ORC_VECTOR_BIT_MASK(bitWidth));
__m512i permutexIdx = _mm512_loadu_si512(permutexIdxTable5u);
__m512i shuffleIdxPtr[2];
shuffleIdxPtr[0] = _mm512_loadu_si512(shuffleIdxTable5u_0);
shuffleIdxPtr[1] = _mm512_loadu_si512(shuffleIdxTable5u_1);
__m512i shiftMaskPtr[2];
shiftMaskPtr[0] = _mm512_loadu_si512(shiftTable5u_0);
shiftMaskPtr[1] = _mm512_loadu_si512(shiftTable5u_1);
while (numElements >= VECTOR_UNPACK_8BIT_MAX_NUM) {
__m512i srcmm, zmm[2];
srcmm = _mm512_maskz_loadu_epi8(readMask, srcPtr);
srcmm = _mm512_permutexvar_epi16(permutexIdx, srcmm);
// shuffling so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones
zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[0]);
zmm[1] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[1]);
// shifting elements so they start from the start of the word
zmm[0] = _mm512_srlv_epi16(zmm[0], shiftMaskPtr[0]);
zmm[1] = _mm512_sllv_epi16(zmm[1], shiftMaskPtr[1]);
// gathering even and odd elements together
zmm[0] = _mm512_mask_mov_epi8(zmm[0], 0xAAAAAAAAAAAAAAAA, zmm[1]);
zmm[0] = _mm512_and_si512(zmm[0], parseMask);
_mm512_storeu_si512(simdPtr, zmm[0]);
srcPtr += 8 * bitWidth;
decoder->resetBufferStart(8 * bitWidth, false, 0);
bufRestByteLen = decoder->bufLength();
bufMoveByteLen -= 8 * bitWidth;
numElements -= VECTOR_UNPACK_8BIT_MAX_NUM;
std::copy(simdPtr, simdPtr + VECTOR_UNPACK_8BIT_MAX_NUM, dstPtr);
dstPtr += VECTOR_UNPACK_8BIT_MAX_NUM;
}
}
alignTailerBoundary<true>(bitWidth, 0, startBit, bufMoveByteLen, bufRestByteLen, len,
backupByteLen, numElements, resetBuf, srcPtr, dstPtr);
}
}
void UnpackAvx512::vectorUnpack6(int64_t* data, uint64_t offset, uint64_t len) {
uint32_t bitWidth = 6;
const uint8_t* srcPtr = reinterpret_cast<const uint8_t*>(decoder->getBufStart());
uint64_t numElements = 0;
int64_t* dstPtr = data + offset;
uint64_t bufMoveByteLen = 0;
uint64_t bufRestByteLen = decoder->bufLength();
bool resetBuf = false;
uint64_t startBit = 0;
uint64_t tailBitLen = 0;
uint32_t backupByteLen = 0;
while (len > 0) {
alignHeaderBoundary<true>(bitWidth, UNPACK_8Bit_MAX_SIZE, startBit, bufMoveByteLen,
bufRestByteLen, len, tailBitLen, backupByteLen, numElements,
resetBuf, srcPtr, dstPtr);
if (numElements >= VECTOR_UNPACK_8BIT_MAX_NUM) {
uint8_t* simdPtr = reinterpret_cast<uint8_t*>(vectorBuf);
__mmask64 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_BYTE(bitWidth * 64));
__m512i parseMask = _mm512_set1_epi8(ORC_VECTOR_BIT_MASK(bitWidth));
__m512i permutexIdx = _mm512_loadu_si512(permutexIdxTable6u);
__m512i shuffleIdxPtr[2];
shuffleIdxPtr[0] = _mm512_loadu_si512(shuffleIdxTable6u_0);
shuffleIdxPtr[1] = _mm512_loadu_si512(shuffleIdxTable6u_1);
__m512i shiftMaskPtr[2];
shiftMaskPtr[0] = _mm512_loadu_si512(shiftTable6u_0);
shiftMaskPtr[1] = _mm512_loadu_si512(shiftTable6u_1);
while (numElements >= VECTOR_UNPACK_8BIT_MAX_NUM) {
__m512i srcmm, zmm[2];
srcmm = _mm512_maskz_loadu_epi8(readMask, srcPtr);
srcmm = _mm512_permutexvar_epi32(permutexIdx, srcmm);
// shuffling so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones
zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[0]);
zmm[1] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[1]);
// shifting elements so they start from the start of the word
zmm[0] = _mm512_srlv_epi16(zmm[0], shiftMaskPtr[0]);
zmm[1] = _mm512_sllv_epi16(zmm[1], shiftMaskPtr[1]);
// gathering even and odd elements together
zmm[0] = _mm512_mask_mov_epi8(zmm[0], 0xAAAAAAAAAAAAAAAA, zmm[1]);
zmm[0] = _mm512_and_si512(zmm[0], parseMask);
_mm512_storeu_si512(simdPtr, zmm[0]);
srcPtr += 8 * bitWidth;
decoder->resetBufferStart(8 * bitWidth, false, 0);
bufRestByteLen = decoder->bufLength();
bufMoveByteLen -= 8 * bitWidth;
numElements -= VECTOR_UNPACK_8BIT_MAX_NUM;
std::copy(simdPtr, simdPtr + VECTOR_UNPACK_8BIT_MAX_NUM, dstPtr);
dstPtr += VECTOR_UNPACK_8BIT_MAX_NUM;
}
}
alignTailerBoundary<true>(bitWidth, 0, startBit, bufMoveByteLen, bufRestByteLen, len,
backupByteLen, numElements, resetBuf, srcPtr, dstPtr);
}
}
void UnpackAvx512::vectorUnpack7(int64_t* data, uint64_t offset, uint64_t len) {
uint32_t bitWidth = 7;
const uint8_t* srcPtr = reinterpret_cast<const uint8_t*>(decoder->getBufStart());
uint64_t numElements = 0;
int64_t* dstPtr = data + offset;
uint64_t bufMoveByteLen = 0;
uint64_t bufRestByteLen = decoder->bufLength();
bool resetBuf = false;
uint64_t startBit = 0;
uint64_t tailBitLen = 0;
uint32_t backupByteLen = 0;
while (len > 0) {
alignHeaderBoundary<true>(bitWidth, UNPACK_8Bit_MAX_SIZE, startBit, bufMoveByteLen,
bufRestByteLen, len, tailBitLen, backupByteLen, numElements,
resetBuf, srcPtr, dstPtr);
if (numElements >= VECTOR_UNPACK_8BIT_MAX_NUM) {
uint8_t* simdPtr = reinterpret_cast<uint8_t*>(vectorBuf);
__mmask64 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_BYTE(bitWidth * 64));
__m512i parseMask = _mm512_set1_epi8(ORC_VECTOR_BIT_MASK(bitWidth));
__m512i permutexIdx = _mm512_loadu_si512(permutexIdxTable7u);
__m512i shuffleIdxPtr[2];
shuffleIdxPtr[0] = _mm512_loadu_si512(shuffleIdxTable7u_0);
shuffleIdxPtr[1] = _mm512_loadu_si512(shuffleIdxTable7u_1);
__m512i shiftMaskPtr[2];
shiftMaskPtr[0] = _mm512_loadu_si512(shiftTable7u_0);
shiftMaskPtr[1] = _mm512_loadu_si512(shiftTable7u_1);
while (numElements >= VECTOR_UNPACK_8BIT_MAX_NUM) {
__m512i srcmm, zmm[2];
srcmm = _mm512_maskz_loadu_epi8(readMask, srcPtr);
srcmm = _mm512_permutexvar_epi16(permutexIdx, srcmm);
// shuffling so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones
zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[0]);
zmm[1] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[1]);
// shifting elements so they start from the start of the word
zmm[0] = _mm512_srlv_epi16(zmm[0], shiftMaskPtr[0]);
zmm[1] = _mm512_sllv_epi16(zmm[1], shiftMaskPtr[1]);
// gathering even and odd elements together
zmm[0] = _mm512_mask_mov_epi8(zmm[0], 0xAAAAAAAAAAAAAAAA, zmm[1]);
zmm[0] = _mm512_and_si512(zmm[0], parseMask);
_mm512_storeu_si512(simdPtr, zmm[0]);
srcPtr += 8 * bitWidth;
decoder->resetBufferStart(8 * bitWidth, false, 0);
bufRestByteLen = decoder->bufLength();
bufMoveByteLen -= 8 * bitWidth;
numElements -= VECTOR_UNPACK_8BIT_MAX_NUM;
std::copy(simdPtr, simdPtr + VECTOR_UNPACK_8BIT_MAX_NUM, dstPtr);
dstPtr += VECTOR_UNPACK_8BIT_MAX_NUM;
}
}
alignTailerBoundary<true>(bitWidth, 0, startBit, bufMoveByteLen, bufRestByteLen, len,
backupByteLen, numElements, resetBuf, srcPtr, dstPtr);
}
}
void UnpackAvx512::vectorUnpack9(int64_t* data, uint64_t offset, uint64_t len) {
uint32_t bitWidth = 9;
const uint8_t* srcPtr = reinterpret_cast<const uint8_t*>(decoder->getBufStart());
uint64_t numElements = 0;
int64_t* dstPtr = data + offset;
uint64_t bufMoveByteLen = 0;
uint64_t bufRestByteLen = decoder->bufLength();
bool resetBuf = false;
uint64_t startBit = 0;
uint64_t tailBitLen = 0;
uint32_t backupByteLen = 0;
while (len > 0) {
alignHeaderBoundary<true>(bitWidth, UNPACK_16Bit_MAX_SIZE, startBit, bufMoveByteLen,
bufRestByteLen, len, tailBitLen, backupByteLen, numElements,
resetBuf, srcPtr, dstPtr);
if (numElements >= VECTOR_UNPACK_16BIT_MAX_NUM) {
uint16_t* simdPtr = reinterpret_cast<uint16_t*>(vectorBuf);
__mmask32 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_WORD(bitWidth * 32));
__m512i parseMask0 = _mm512_set1_epi16(ORC_VECTOR_BIT_MASK(bitWidth));
__m512i nibbleReversemm = _mm512_loadu_si512(nibbleReverseTable);
__m512i reverseMask16u = _mm512_loadu_si512(reverseMaskTable16u);
__m512i maskmm = _mm512_set1_epi8(0x0F);
__m512i shuffleIdxPtr = _mm512_loadu_si512(shuffleIdxTable9u_0);
__m512i permutexIdxPtr[2];
permutexIdxPtr[0] = _mm512_loadu_si512(permutexIdxTable9u_0);
permutexIdxPtr[1] = _mm512_loadu_si512(permutexIdxTable9u_1);
__m512i shiftMaskPtr[3];
shiftMaskPtr[0] = _mm512_loadu_si512(shiftTable9u_0);
shiftMaskPtr[1] = _mm512_loadu_si512(shiftTable9u_1);
shiftMaskPtr[2] = _mm512_loadu_si512(shiftTable9u_2);
__m512i gatherIdxmm = _mm512_loadu_si512(gatherIdxTable9u);
while (numElements >= 2 * VECTOR_UNPACK_16BIT_MAX_NUM) {
__m512i srcmm, zmm[2];
srcmm = _mm512_i64gather_epi64(gatherIdxmm, srcPtr, 1);
zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr);
// shifting elements so they start from the start of the word
zmm[0] = _mm512_srlv_epi16(zmm[0], shiftMaskPtr[2]);
zmm[0] = _mm512_and_si512(zmm[0], parseMask0);
_mm512_storeu_si512(simdPtr, zmm[0]);
srcPtr += 4 * bitWidth;
decoder->resetBufferStart(4 * bitWidth, false, 0);
bufRestByteLen = decoder->bufLength();
bufMoveByteLen -= 4 * bitWidth;
numElements -= VECTOR_UNPACK_16BIT_MAX_NUM;
std::copy(simdPtr, simdPtr + VECTOR_UNPACK_16BIT_MAX_NUM, dstPtr);
dstPtr += VECTOR_UNPACK_16BIT_MAX_NUM;
}
if (numElements >= VECTOR_UNPACK_16BIT_MAX_NUM) {
__m512i srcmm, zmm[2];
srcmm = _mm512_maskz_loadu_epi16(readMask, srcPtr);
__m512i lowNibblemm = _mm512_and_si512(srcmm, maskmm);
__m512i highNibblemm = _mm512_srli_epi16(srcmm, 4);
highNibblemm = _mm512_and_si512(highNibblemm, maskmm);
lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm);
highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm);
lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4);
srcmm = _mm512_or_si512(lowNibblemm, highNibblemm);
// permuting so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones
zmm[0] = _mm512_permutexvar_epi16(permutexIdxPtr[0], srcmm);
zmm[1] = _mm512_permutexvar_epi16(permutexIdxPtr[1], srcmm);
// shifting elements so they start from the start of the word
zmm[0] = _mm512_srlv_epi32(zmm[0], shiftMaskPtr[0]);
zmm[1] = _mm512_sllv_epi32(zmm[1], shiftMaskPtr[1]);
// gathering even and odd elements together
zmm[0] = _mm512_mask_mov_epi16(zmm[0], 0xAAAAAAAA, zmm[1]);
zmm[0] = _mm512_and_si512(zmm[0], parseMask0);
zmm[0] = _mm512_slli_epi16(zmm[0], 7);
lowNibblemm = _mm512_and_si512(zmm[0], maskmm);
highNibblemm = _mm512_srli_epi16(zmm[0], 4);
highNibblemm = _mm512_and_si512(highNibblemm, maskmm);
lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm);
highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm);
lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4);
zmm[0] = _mm512_or_si512(lowNibblemm, highNibblemm);
zmm[0] = _mm512_shuffle_epi8(zmm[0], reverseMask16u);
_mm512_storeu_si512(simdPtr, zmm[0]);
srcPtr += 4 * bitWidth;
decoder->resetBufferStart(4 * bitWidth, false, 0);
bufRestByteLen = decoder->bufLength();
bufMoveByteLen -= 4 * bitWidth;
numElements -= VECTOR_UNPACK_16BIT_MAX_NUM;
std::copy(simdPtr, simdPtr + VECTOR_UNPACK_16BIT_MAX_NUM, dstPtr);
dstPtr += VECTOR_UNPACK_16BIT_MAX_NUM;
}
}
alignTailerBoundary<true>(bitWidth, 0, startBit, bufMoveByteLen, bufRestByteLen, len,
backupByteLen, numElements, resetBuf, srcPtr, dstPtr);
}
}
void UnpackAvx512::vectorUnpack10(int64_t* data, uint64_t offset, uint64_t len) {
uint32_t bitWidth = 10;
const uint8_t* srcPtr = reinterpret_cast<const uint8_t*>(decoder->getBufStart());
uint64_t numElements = 0;
int64_t* dstPtr = data + offset;
uint64_t bufMoveByteLen = 0;
uint64_t bufRestByteLen = decoder->bufLength();
bool resetBuf = false;
uint64_t startBit = 0;
uint64_t tailBitLen = 0;
uint32_t backupByteLen = 0;
while (len > 0) {
alignHeaderBoundary<true>(bitWidth, UNPACK_16Bit_MAX_SIZE, startBit, bufMoveByteLen,
bufRestByteLen, len, tailBitLen, backupByteLen, numElements,
resetBuf, srcPtr, dstPtr);
if (numElements >= VECTOR_UNPACK_16BIT_MAX_NUM) {
uint16_t* simdPtr = reinterpret_cast<uint16_t*>(vectorBuf);
__mmask32 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_WORD(bitWidth * 32));
__m512i parseMask0 = _mm512_set1_epi16(ORC_VECTOR_BIT_MASK(bitWidth));
__m512i shuffleIdxPtr = _mm512_loadu_si512(shuffleIdxTable10u_0);
__m512i permutexIdx = _mm512_loadu_si512(permutexIdxTable10u);
__m512i shiftMask = _mm512_loadu_si512(shiftTable10u);
while (numElements >= VECTOR_UNPACK_16BIT_MAX_NUM) {
__m512i srcmm, zmm;
srcmm = _mm512_maskz_loadu_epi16(readMask, srcPtr);
zmm = _mm512_permutexvar_epi16(permutexIdx, srcmm);
zmm = _mm512_shuffle_epi8(zmm, shuffleIdxPtr);
// shifting elements so they start from the start of the word
zmm = _mm512_srlv_epi16(zmm, shiftMask);
zmm = _mm512_and_si512(zmm, parseMask0);
_mm512_storeu_si512(simdPtr, zmm);
srcPtr += 4 * bitWidth;
decoder->resetBufferStart(4 * bitWidth, false, 0);
bufRestByteLen = decoder->bufLength();
bufMoveByteLen -= 4 * bitWidth;
numElements -= VECTOR_UNPACK_16BIT_MAX_NUM;
std::copy(simdPtr, simdPtr + VECTOR_UNPACK_16BIT_MAX_NUM, dstPtr);
dstPtr += VECTOR_UNPACK_16BIT_MAX_NUM;
}
}
alignTailerBoundary<true>(bitWidth, 0, startBit, bufMoveByteLen, bufRestByteLen, len,
backupByteLen, numElements, resetBuf, srcPtr, dstPtr);
}
}
void UnpackAvx512::vectorUnpack11(int64_t* data, uint64_t offset, uint64_t len) {
uint32_t bitWidth = 11;
const uint8_t* srcPtr = reinterpret_cast<const uint8_t*>(decoder->getBufStart());
uint64_t numElements = 0;
int64_t* dstPtr = data + offset;
uint64_t bufMoveByteLen = 0;
uint64_t bufRestByteLen = decoder->bufLength();
bool resetBuf = false;
uint64_t startBit = 0;
uint64_t tailBitLen = 0;
uint32_t backupByteLen = 0;
while (len > 0) {
alignHeaderBoundary<true>(bitWidth, UNPACK_16Bit_MAX_SIZE, startBit, bufMoveByteLen,
bufRestByteLen, len, tailBitLen, backupByteLen, numElements,
resetBuf, srcPtr, dstPtr);
if (numElements >= VECTOR_UNPACK_16BIT_MAX_NUM) {
uint16_t* simdPtr = reinterpret_cast<uint16_t*>(vectorBuf);
__mmask32 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_WORD(bitWidth * 32));
__m512i parseMask0 = _mm512_set1_epi16(ORC_VECTOR_BIT_MASK(bitWidth));
__m512i nibbleReversemm = _mm512_loadu_si512(nibbleReverseTable);
__m512i reverse_mask_16u = _mm512_loadu_si512(reverseMaskTable16u);
__m512i maskmm = _mm512_set1_epi8(0x0F);
__m512i shuffleIdxPtr[2];
shuffleIdxPtr[0] = _mm512_loadu_si512(shuffleIdxTable11u_0);
shuffleIdxPtr[1] = _mm512_loadu_si512(shuffleIdxTable11u_1);
__m512i permutexIdxPtr[2];
permutexIdxPtr[0] = _mm512_loadu_si512(permutexIdxTable11u_0);
permutexIdxPtr[1] = _mm512_loadu_si512(permutexIdxTable11u_1);
__m512i shiftMaskPtr[4];
shiftMaskPtr[0] = _mm512_loadu_si512(shiftTable11u_0);
shiftMaskPtr[1] = _mm512_loadu_si512(shiftTable11u_1);
shiftMaskPtr[2] = _mm512_loadu_si512(shiftTable11u_2);
shiftMaskPtr[3] = _mm512_loadu_si512(shiftTable11u_3);
__m512i gatherIdxmm = _mm512_loadu_si512(gatherIdxTable11u);
while (numElements >= 2 * VECTOR_UNPACK_16BIT_MAX_NUM) {
__m512i srcmm, zmm[2];
srcmm = _mm512_i64gather_epi64(gatherIdxmm, srcPtr, 1);
// shuffling so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones
zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[0]);
zmm[1] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[1]);
// shifting elements so they start from the start of the word
zmm[0] = _mm512_srlv_epi32(zmm[0], shiftMaskPtr[2]);
zmm[1] = _mm512_sllv_epi32(zmm[1], shiftMaskPtr[3]);
// gathering even and odd elements together
zmm[0] = _mm512_mask_mov_epi16(zmm[0], 0xAAAAAAAA, zmm[1]);
zmm[0] = _mm512_and_si512(zmm[0], parseMask0);
_mm512_storeu_si512(simdPtr, zmm[0]);
srcPtr += 4 * bitWidth;
decoder->resetBufferStart(4 * bitWidth, false, 0);
bufRestByteLen = decoder->bufLength();
bufMoveByteLen -= 4 * bitWidth;
numElements -= VECTOR_UNPACK_16BIT_MAX_NUM;
std::copy(simdPtr, simdPtr + VECTOR_UNPACK_16BIT_MAX_NUM, dstPtr);
dstPtr += VECTOR_UNPACK_16BIT_MAX_NUM;
}
if (numElements >= VECTOR_UNPACK_16BIT_MAX_NUM) {
__m512i srcmm, zmm[2];
srcmm = _mm512_maskz_loadu_epi16(readMask, srcPtr);
__m512i lowNibblemm = _mm512_and_si512(srcmm, maskmm);
__m512i highNibblemm = _mm512_srli_epi16(srcmm, 4);
highNibblemm = _mm512_and_si512(highNibblemm, maskmm);
lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm);
highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm);
lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4u);
srcmm = _mm512_or_si512(lowNibblemm, highNibblemm);
// permuting so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones
zmm[0] = _mm512_permutexvar_epi16(permutexIdxPtr[0], srcmm);
zmm[1] = _mm512_permutexvar_epi16(permutexIdxPtr[1], srcmm);
// shifting elements so they start from the start of the word
zmm[0] = _mm512_srlv_epi32(zmm[0], shiftMaskPtr[0]);
zmm[1] = _mm512_sllv_epi32(zmm[1], shiftMaskPtr[1]);
// gathering even and odd elements together
zmm[0] = _mm512_mask_mov_epi16(zmm[0], 0xAAAAAAAA, zmm[1]);
zmm[0] = _mm512_and_si512(zmm[0], parseMask0);
zmm[0] = _mm512_slli_epi16(zmm[0], 5);
lowNibblemm = _mm512_and_si512(zmm[0], maskmm);
highNibblemm = _mm512_srli_epi16(zmm[0], 4);
highNibblemm = _mm512_and_si512(highNibblemm, maskmm);
lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm);
highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm);
lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4);
zmm[0] = _mm512_or_si512(lowNibblemm, highNibblemm);
zmm[0] = _mm512_shuffle_epi8(zmm[0], reverse_mask_16u);
_mm512_storeu_si512(simdPtr, zmm[0]);
srcPtr += 4 * bitWidth;
decoder->resetBufferStart(4 * bitWidth, false, 0);
bufRestByteLen = decoder->bufLength();
bufMoveByteLen -= 4 * bitWidth;
numElements -= VECTOR_UNPACK_16BIT_MAX_NUM;
std::copy(simdPtr, simdPtr + VECTOR_UNPACK_16BIT_MAX_NUM, dstPtr);
dstPtr += VECTOR_UNPACK_16BIT_MAX_NUM;
}
}
alignTailerBoundary<true>(bitWidth, 0, startBit, bufMoveByteLen, bufRestByteLen, len,
backupByteLen, numElements, resetBuf, srcPtr, dstPtr);
}
}
void UnpackAvx512::vectorUnpack12(int64_t* data, uint64_t offset, uint64_t len) {
uint32_t bitWidth = 12;
const uint8_t* srcPtr = reinterpret_cast<const uint8_t*>(decoder->getBufStart());
uint64_t numElements = 0;
int64_t* dstPtr = data + offset;
uint64_t bufMoveByteLen = 0;
uint64_t bufRestByteLen = decoder->bufLength();
bool resetBuf = false;
uint64_t startBit = 0;
uint64_t tailBitLen = 0;
uint32_t backupByteLen = 0;
while (len > 0) {
alignHeaderBoundary<true>(bitWidth, UNPACK_16Bit_MAX_SIZE, startBit, bufMoveByteLen,
bufRestByteLen, len, tailBitLen, backupByteLen, numElements,
resetBuf, srcPtr, dstPtr);
if (numElements >= VECTOR_UNPACK_16BIT_MAX_NUM) {
uint16_t* simdPtr = reinterpret_cast<uint16_t*>(vectorBuf);
__mmask32 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_WORD(bitWidth * 32));
__m512i parseMask0 = _mm512_set1_epi16(ORC_VECTOR_BIT_MASK(bitWidth));
__m512i shuffleIdxPtr = _mm512_loadu_si512(shuffleIdxTable12u_0);
__m512i permutexIdx = _mm512_loadu_si512(permutexIdxTable12u);
__m512i shiftMask = _mm512_loadu_si512(shiftTable12u);
while (numElements >= VECTOR_UNPACK_16BIT_MAX_NUM) {
__m512i srcmm, zmm;
srcmm = _mm512_maskz_loadu_epi16(readMask, srcPtr);
zmm = _mm512_permutexvar_epi32(permutexIdx, srcmm);
zmm = _mm512_shuffle_epi8(zmm, shuffleIdxPtr);
// shifting elements so they start from the start of the word
zmm = _mm512_srlv_epi16(zmm, shiftMask);
zmm = _mm512_and_si512(zmm, parseMask0);
_mm512_storeu_si512(simdPtr, zmm);
srcPtr += 4 * bitWidth;
decoder->resetBufferStart(4 * bitWidth, false, 0);
bufRestByteLen = decoder->bufLength();
bufMoveByteLen -= 4 * bitWidth;
numElements -= VECTOR_UNPACK_16BIT_MAX_NUM;
std::copy(simdPtr, simdPtr + VECTOR_UNPACK_16BIT_MAX_NUM, dstPtr);
dstPtr += VECTOR_UNPACK_16BIT_MAX_NUM;
}
}
alignTailerBoundary<true>(bitWidth, 0, startBit, bufMoveByteLen, bufRestByteLen, len,
backupByteLen, numElements, resetBuf, srcPtr, dstPtr);
}
}
void UnpackAvx512::vectorUnpack13(int64_t* data, uint64_t offset, uint64_t len) {
uint32_t bitWidth = 13;
const uint8_t* srcPtr = reinterpret_cast<const uint8_t*>(decoder->getBufStart());
uint64_t numElements = 0;
int64_t* dstPtr = data + offset;
uint64_t bufMoveByteLen = 0;
uint64_t bufRestByteLen = decoder->bufLength();
bool resetBuf = false;
uint64_t startBit = 0;
uint64_t tailBitLen = 0;
uint32_t backupByteLen = 0;
while (len > 0) {
alignHeaderBoundary<true>(bitWidth, UNPACK_16Bit_MAX_SIZE, startBit, bufMoveByteLen,
bufRestByteLen, len, tailBitLen, backupByteLen, numElements,
resetBuf, srcPtr, dstPtr);
if (numElements >= VECTOR_UNPACK_16BIT_MAX_NUM) {
uint16_t* simdPtr = reinterpret_cast<uint16_t*>(vectorBuf);
__mmask32 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_WORD(bitWidth * 32));
__m512i parseMask0 = _mm512_set1_epi16(ORC_VECTOR_BIT_MASK(bitWidth));
__m512i nibbleReversemm = _mm512_loadu_si512(nibbleReverseTable);
__m512i reverse_mask_16u = _mm512_loadu_si512(reverseMaskTable16u);
__m512i maskmm = _mm512_set1_epi8(0x0F);
__m512i shuffleIdxPtr[2];
shuffleIdxPtr[0] = _mm512_loadu_si512(shuffleIdxTable13u_0);
shuffleIdxPtr[1] = _mm512_loadu_si512(shuffleIdxTable13u_1);
__m512i permutexIdxPtr[2];
permutexIdxPtr[0] = _mm512_loadu_si512(permutexIdxTable13u_0);
permutexIdxPtr[1] = _mm512_loadu_si512(permutexIdxTable13u_1);
__m512i shiftMaskPtr[4];
shiftMaskPtr[0] = _mm512_loadu_si512(shiftTable13u_0);
shiftMaskPtr[1] = _mm512_loadu_si512(shiftTable13u_1);
shiftMaskPtr[2] = _mm512_loadu_si512(shiftTable13u_2);
shiftMaskPtr[3] = _mm512_loadu_si512(shiftTable13u_3);
__m512i gatherIdxmm = _mm512_loadu_si512(gatherIdxTable13u);
while (numElements >= 2 * VECTOR_UNPACK_16BIT_MAX_NUM) {
__m512i srcmm, zmm[2];
srcmm = _mm512_i64gather_epi64(gatherIdxmm, srcPtr, 1);
// shuffling so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones
zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[0]);
zmm[1] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[1]);
// shifting elements so they start from the start of the word
zmm[0] = _mm512_srlv_epi32(zmm[0], shiftMaskPtr[2]);
zmm[1] = _mm512_sllv_epi32(zmm[1], shiftMaskPtr[3]);
// gathering even and odd elements together
zmm[0] = _mm512_mask_mov_epi16(zmm[0], 0xAAAAAAAA, zmm[1]);
zmm[0] = _mm512_and_si512(zmm[0], parseMask0);
_mm512_storeu_si512(simdPtr, zmm[0]);
srcPtr += 4 * bitWidth;
decoder->resetBufferStart(4 * bitWidth, false, 0);
bufRestByteLen = decoder->bufLength();
bufMoveByteLen -= 4 * bitWidth;
numElements -= VECTOR_UNPACK_16BIT_MAX_NUM;
std::copy(simdPtr, simdPtr + VECTOR_UNPACK_16BIT_MAX_NUM, dstPtr);
dstPtr += VECTOR_UNPACK_16BIT_MAX_NUM;
}
if (numElements >= VECTOR_UNPACK_16BIT_MAX_NUM) {
__m512i srcmm, zmm[2];
srcmm = _mm512_maskz_loadu_epi16(readMask, srcPtr);
__m512i lowNibblemm = _mm512_and_si512(srcmm, maskmm);
__m512i highNibblemm = _mm512_srli_epi16(srcmm, 4);
highNibblemm = _mm512_and_si512(highNibblemm, maskmm);
lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm);
highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm);
lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4);
srcmm = _mm512_or_si512(lowNibblemm, highNibblemm);
// permuting so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones
zmm[0] = _mm512_permutexvar_epi16(permutexIdxPtr[0], srcmm);
zmm[1] = _mm512_permutexvar_epi16(permutexIdxPtr[1], srcmm);
// shifting elements so they start from the start of the word
zmm[0] = _mm512_srlv_epi32(zmm[0], shiftMaskPtr[0]);
zmm[1] = _mm512_sllv_epi32(zmm[1], shiftMaskPtr[1]);
// gathering even and odd elements together
zmm[0] = _mm512_mask_mov_epi16(zmm[0], 0xAAAAAAAA, zmm[1]);
zmm[0] = _mm512_and_si512(zmm[0], parseMask0);
zmm[0] = _mm512_slli_epi16(zmm[0], 3);
lowNibblemm = _mm512_and_si512(zmm[0], maskmm);
highNibblemm = _mm512_srli_epi16(zmm[0], 4);
highNibblemm = _mm512_and_si512(highNibblemm, maskmm);
lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm);
highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm);
lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4);
zmm[0] = _mm512_or_si512(lowNibblemm, highNibblemm);
zmm[0] = _mm512_shuffle_epi8(zmm[0], reverse_mask_16u);
_mm512_storeu_si512(simdPtr, zmm[0]);
srcPtr += 4 * bitWidth;
decoder->resetBufferStart(4 * bitWidth, false, 0);
bufRestByteLen = decoder->bufLength();
bufMoveByteLen -= 4 * bitWidth;
numElements -= VECTOR_UNPACK_16BIT_MAX_NUM;
std::copy(simdPtr, simdPtr + VECTOR_UNPACK_16BIT_MAX_NUM, dstPtr);
dstPtr += VECTOR_UNPACK_16BIT_MAX_NUM;
}
}
alignTailerBoundary<true>(bitWidth, 0, startBit, bufMoveByteLen, bufRestByteLen, len,
backupByteLen, numElements, resetBuf, srcPtr, dstPtr);
}
}
void UnpackAvx512::vectorUnpack14(int64_t* data, uint64_t offset, uint64_t len) {
uint32_t bitWidth = 14;
const uint8_t* srcPtr = reinterpret_cast<const uint8_t*>(decoder->getBufStart());
uint64_t numElements = 0;
int64_t* dstPtr = data + offset;
uint64_t bufMoveByteLen = 0;
uint64_t bufRestByteLen = decoder->bufLength();
bool resetBuf = false;
uint64_t startBit = 0;
uint64_t tailBitLen = 0;
uint32_t backupByteLen = 0;
while (len > 0) {
alignHeaderBoundary<true>(bitWidth, UNPACK_16Bit_MAX_SIZE, startBit, bufMoveByteLen,
bufRestByteLen, len, tailBitLen, backupByteLen, numElements,
resetBuf, srcPtr, dstPtr);
if (numElements >= VECTOR_UNPACK_16BIT_MAX_NUM) {
uint16_t* simdPtr = reinterpret_cast<uint16_t*>(vectorBuf);
__mmask32 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_WORD(bitWidth * 32));
__m512i parseMask0 = _mm512_set1_epi16(ORC_VECTOR_BIT_MASK(bitWidth));
__m512i shuffleIdxPtr[2];
shuffleIdxPtr[0] = _mm512_loadu_si512(shuffleIdxTable14u_0);
shuffleIdxPtr[1] = _mm512_loadu_si512(shuffleIdxTable14u_1);
__m512i permutexIdx = _mm512_loadu_si512(permutexIdxTable14u);
__m512i shiftMaskPtr[2];
shiftMaskPtr[0] = _mm512_loadu_si512(shiftTable14u_0);
shiftMaskPtr[1] = _mm512_loadu_si512(shiftTable14u_1);
while (numElements >= VECTOR_UNPACK_16BIT_MAX_NUM) {
__m512i srcmm, zmm[2];
srcmm = _mm512_maskz_loadu_epi16(readMask, srcPtr);
srcmm = _mm512_permutexvar_epi16(permutexIdx, srcmm);
// shuffling so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones
zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[0]);
zmm[1] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[1]);
// shifting elements so they start from the start of the word
zmm[0] = _mm512_srlv_epi32(zmm[0], shiftMaskPtr[0]);
zmm[1] = _mm512_sllv_epi32(zmm[1], shiftMaskPtr[1]);
// gathering even and odd elements together
zmm[0] = _mm512_mask_mov_epi16(zmm[0], 0xAAAAAAAA, zmm[1]);
zmm[0] = _mm512_and_si512(zmm[0], parseMask0);
_mm512_storeu_si512(simdPtr, zmm[0]);
srcPtr += 4 * bitWidth;
decoder->resetBufferStart(4 * bitWidth, false, 0);
bufRestByteLen = decoder->bufLength();
bufMoveByteLen -= 4 * bitWidth;
numElements -= VECTOR_UNPACK_16BIT_MAX_NUM;
std::copy(simdPtr, simdPtr + VECTOR_UNPACK_16BIT_MAX_NUM, dstPtr);
dstPtr += VECTOR_UNPACK_16BIT_MAX_NUM;
}
}
alignTailerBoundary<true>(bitWidth, 0, startBit, bufMoveByteLen, bufRestByteLen, len,
backupByteLen, numElements, resetBuf, srcPtr, dstPtr);
}
}
void UnpackAvx512::vectorUnpack15(int64_t* data, uint64_t offset, uint64_t len) {
uint32_t bitWidth = 15;
const uint8_t* srcPtr = reinterpret_cast<const uint8_t*>(decoder->getBufStart());
uint64_t numElements = 0;
int64_t* dstPtr = data + offset;
uint64_t bufMoveByteLen = 0;
uint64_t bufRestByteLen = decoder->bufLength();
bool resetBuf = false;
uint64_t startBit = 0;
uint64_t tailBitLen = 0;
uint32_t backupByteLen = 0;
while (len > 0) {
alignHeaderBoundary<true>(bitWidth, UNPACK_16Bit_MAX_SIZE, startBit, bufMoveByteLen,
bufRestByteLen, len, tailBitLen, backupByteLen, numElements,
resetBuf, srcPtr, dstPtr);
if (numElements >= VECTOR_UNPACK_16BIT_MAX_NUM) {
uint16_t* simdPtr = reinterpret_cast<uint16_t*>(vectorBuf);
__mmask32 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_WORD(bitWidth * 32));
__m512i parseMask0 = _mm512_set1_epi16(ORC_VECTOR_BIT_MASK(bitWidth));
__m512i nibbleReversemm = _mm512_loadu_si512(nibbleReverseTable);
__m512i reverseMask16u = _mm512_loadu_si512(reverseMaskTable16u);
__m512i maskmm = _mm512_set1_epi8(0x0F);
__m512i shuffleIdxPtr[2];
shuffleIdxPtr[0] = _mm512_loadu_si512(shuffleIdxTable15u_0);
shuffleIdxPtr[1] = _mm512_loadu_si512(shuffleIdxTable15u_1);
__m512i permutexIdxPtr[2];
permutexIdxPtr[0] = _mm512_loadu_si512(permutexIdxTable15u_0);
permutexIdxPtr[1] = _mm512_loadu_si512(permutexIdxTable15u_1);
__m512i shiftMaskPtr[4];
shiftMaskPtr[0] = _mm512_loadu_si512(shiftTable15u_0);
shiftMaskPtr[1] = _mm512_loadu_si512(shiftTable15u_1);
shiftMaskPtr[2] = _mm512_loadu_si512(shiftTable15u_2);
shiftMaskPtr[3] = _mm512_loadu_si512(shiftTable15u_3);
__m512i gatherIdxmm = _mm512_loadu_si512(gatherIdxTable15u);
while (numElements >= 2 * VECTOR_UNPACK_16BIT_MAX_NUM) {
__m512i srcmm, zmm[2];
srcmm = _mm512_i64gather_epi64(gatherIdxmm, srcPtr, 1);
// shuffling so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones
zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[0]);
zmm[1] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[1]);
// shifting elements so they start from the start of the word
zmm[0] = _mm512_srlv_epi32(zmm[0], shiftMaskPtr[2]);
zmm[1] = _mm512_sllv_epi32(zmm[1], shiftMaskPtr[3]);
// gathering even and odd elements together
zmm[0] = _mm512_mask_mov_epi16(zmm[0], 0xAAAAAAAA, zmm[1]);
zmm[0] = _mm512_and_si512(zmm[0], parseMask0);
_mm512_storeu_si512(simdPtr, zmm[0]);
srcPtr += 4 * bitWidth;
decoder->resetBufferStart(4 * bitWidth, false, 0);
bufRestByteLen = decoder->bufLength();
bufMoveByteLen -= 4 * bitWidth;
numElements -= VECTOR_UNPACK_16BIT_MAX_NUM;
std::copy(simdPtr, simdPtr + VECTOR_UNPACK_16BIT_MAX_NUM, dstPtr);
dstPtr += VECTOR_UNPACK_16BIT_MAX_NUM;
}
if (numElements >= VECTOR_UNPACK_16BIT_MAX_NUM) {
__m512i srcmm, zmm[2];
srcmm = _mm512_maskz_loadu_epi16(readMask, srcPtr);
__m512i lowNibblemm = _mm512_and_si512(srcmm, maskmm);
__m512i highNibblemm = _mm512_srli_epi16(srcmm, 4);
highNibblemm = _mm512_and_si512(highNibblemm, maskmm);
lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm);
highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm);
lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4);
srcmm = _mm512_or_si512(lowNibblemm, highNibblemm);
// permuting so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones
zmm[0] = _mm512_permutexvar_epi16(permutexIdxPtr[0], srcmm);
zmm[1] = _mm512_permutexvar_epi16(permutexIdxPtr[1], srcmm);
// shifting elements so they start from the start of the word
zmm[0] = _mm512_srlv_epi32(zmm[0], shiftMaskPtr[0]);
zmm[1] = _mm512_sllv_epi32(zmm[1], shiftMaskPtr[1]);
// gathering even and odd elements together
zmm[0] = _mm512_mask_mov_epi16(zmm[0], 0xAAAAAAAA, zmm[1]);
zmm[0] = _mm512_and_si512(zmm[0], parseMask0);
zmm[0] = _mm512_slli_epi16(zmm[0], 1);
lowNibblemm = _mm512_and_si512(zmm[0], maskmm);
highNibblemm = _mm512_srli_epi16(zmm[0], 4);
highNibblemm = _mm512_and_si512(highNibblemm, maskmm);
lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm);
highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm);
lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4);
zmm[0] = _mm512_or_si512(lowNibblemm, highNibblemm);
zmm[0] = _mm512_shuffle_epi8(zmm[0], reverseMask16u);
_mm512_storeu_si512(simdPtr, zmm[0]);
srcPtr += 4 * bitWidth;
decoder->resetBufferStart(4 * bitWidth, false, 0);
bufRestByteLen = decoder->bufLength();
bufMoveByteLen -= 4 * bitWidth;
numElements -= VECTOR_UNPACK_16BIT_MAX_NUM;
std::copy(simdPtr, simdPtr + VECTOR_UNPACK_16BIT_MAX_NUM, dstPtr);
dstPtr += VECTOR_UNPACK_16BIT_MAX_NUM;
}
}
alignTailerBoundary<true>(bitWidth, 0, startBit, bufMoveByteLen, bufRestByteLen, len,
backupByteLen, numElements, resetBuf, srcPtr, dstPtr);
}
}
void UnpackAvx512::vectorUnpack16(int64_t* data, uint64_t offset, uint64_t len) {
uint32_t bitWidth = 16;
const uint8_t* srcPtr = reinterpret_cast<const uint8_t*>(decoder->getBufStart());
uint64_t numElements = len;
uint64_t bufMoveByteLen = 0;
uint64_t bufRestByteLen = decoder->bufLength();
int64_t* dstPtr = data + offset;
bool resetBuf = false;
uint64_t tailBitLen = 0;
uint32_t backupByteLen = 0;
uint64_t startBit = 0;
while (len > 0) {
alignHeaderBoundary<false>(bitWidth, UNPACK_16Bit_MAX_SIZE, startBit, bufMoveByteLen,
bufRestByteLen, len, tailBitLen, backupByteLen, numElements,
resetBuf, srcPtr, dstPtr);
if (numElements >= VECTOR_UNPACK_16BIT_MAX_NUM) {
uint16_t* simdPtr = reinterpret_cast<uint16_t*>(vectorBuf);
__m512i reverse_mask_16u = _mm512_loadu_si512(reverseMaskTable16u);
while (numElements >= VECTOR_UNPACK_16BIT_MAX_NUM) {
__m512i srcmm = _mm512_loadu_si512(srcPtr);
srcmm = _mm512_shuffle_epi8(srcmm, reverse_mask_16u);
_mm512_storeu_si512(simdPtr, srcmm);
srcPtr += 4 * bitWidth;
decoder->resetBufferStart(4 * bitWidth, false, 0);
bufRestByteLen = decoder->bufLength();
bufMoveByteLen -= 4 * bitWidth;
numElements -= VECTOR_UNPACK_16BIT_MAX_NUM;
std::copy(simdPtr, simdPtr + VECTOR_UNPACK_16BIT_MAX_NUM, dstPtr);
dstPtr += VECTOR_UNPACK_16BIT_MAX_NUM;
}
}
alignTailerBoundary<false>(bitWidth, 16, startBit, bufMoveByteLen, bufRestByteLen, len,
backupByteLen, numElements, resetBuf, srcPtr, dstPtr);
}
}
void UnpackAvx512::vectorUnpack17(int64_t* data, uint64_t offset, uint64_t len) {
uint32_t bitWidth = 17;
const uint8_t* srcPtr = reinterpret_cast<const uint8_t*>(decoder->getBufStart());
uint64_t numElements = 0;
int64_t* dstPtr = data + offset;
uint64_t bufMoveByteLen = 0;
uint64_t bufRestByteLen = decoder->bufLength();
bool resetBuf = false;
uint64_t startBit = 0;
uint64_t tailBitLen = 0;
uint32_t backupByteLen = 0;
while (len > 0) {
alignHeaderBoundary<true>(bitWidth, UNPACK_32Bit_MAX_SIZE, startBit, bufMoveByteLen,
bufRestByteLen, len, tailBitLen, backupByteLen, numElements,
resetBuf, srcPtr, dstPtr);
if (numElements >= VECTOR_UNPACK_32BIT_MAX_NUM) {
__mmask32 readMask = ORC_VECTOR_BIT_MASK(bitWidth);
__m512i parseMask0 = _mm512_set1_epi32(ORC_VECTOR_BIT_MASK(bitWidth));
__m512i nibbleReversemm = _mm512_loadu_si512(nibbleReverseTable);
__m512i reverseMask32u = _mm512_loadu_si512(reverseMaskTable32u);
__m512i maskmm = _mm512_set1_epi8(0x0F);
__m512i shuffleIdxPtr = _mm512_loadu_si512(shuffleIdxTable17u_0);
__m512i permutexIdxPtr[2];
permutexIdxPtr[0] = _mm512_loadu_si512(permutexIdxTable17u_0);
permutexIdxPtr[1] = _mm512_loadu_si512(permutexIdxTable17u_1);
__m512i shiftMaskPtr[3];
shiftMaskPtr[0] = _mm512_loadu_si512(shiftTable17u_0);
shiftMaskPtr[1] = _mm512_loadu_si512(shiftTable17u_1);
shiftMaskPtr[2] = _mm512_loadu_si512(shiftTable17u_2);
__m512i gatherIdxmm = _mm512_loadu_si512(gatherIdxTable17u);
while (numElements >= 2 * VECTOR_UNPACK_32BIT_MAX_NUM) {
__m512i srcmm, zmm[2];
srcmm = _mm512_i64gather_epi64(gatherIdxmm, srcPtr, 1u);
zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr);
// shifting elements so they start from the start of the word
zmm[0] = _mm512_srlv_epi32(zmm[0], shiftMaskPtr[2]);
zmm[0] = _mm512_and_si512(zmm[0], parseMask0);
_mm512_storeu_si512(vectorBuf, zmm[0]);
srcPtr += 2 * bitWidth;
decoder->resetBufferStart(2 * bitWidth, false, 0);
bufRestByteLen = decoder->bufLength();
bufMoveByteLen -= 2 * bitWidth;
numElements -= VECTOR_UNPACK_32BIT_MAX_NUM;
std::copy(vectorBuf, vectorBuf + VECTOR_UNPACK_32BIT_MAX_NUM, dstPtr);
dstPtr += VECTOR_UNPACK_32BIT_MAX_NUM;
}
if (numElements >= VECTOR_UNPACK_32BIT_MAX_NUM) {
__m512i srcmm, zmm[2];
srcmm = _mm512_maskz_loadu_epi16(readMask, srcPtr);
__m512i lowNibblemm = _mm512_and_si512(srcmm, maskmm);
__m512i highNibblemm = _mm512_srli_epi16(srcmm, 4);
highNibblemm = _mm512_and_si512(highNibblemm, maskmm);
lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm);
highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm);
lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4);
srcmm = _mm512_or_si512(lowNibblemm, highNibblemm);
// permuting so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones
zmm[0] = _mm512_permutexvar_epi32(permutexIdxPtr[0], srcmm);
zmm[1] = _mm512_permutexvar_epi32(permutexIdxPtr[1], srcmm);
// shifting elements so they start from the start of the word
zmm[0] = _mm512_srlv_epi64(zmm[0], shiftMaskPtr[0]);
zmm[1] = _mm512_sllv_epi64(zmm[1], shiftMaskPtr[1]);
// gathering even and odd elements together
zmm[0] = _mm512_mask_mov_epi32(zmm[0], 0xAAAA, zmm[1]);
zmm[0] = _mm512_and_si512(zmm[0], parseMask0);
zmm[0] = _mm512_slli_epi32(zmm[0], 15);
lowNibblemm = _mm512_and_si512(zmm[0], maskmm);
highNibblemm = _mm512_srli_epi16(zmm[0], 4);
highNibblemm = _mm512_and_si512(highNibblemm, maskmm);
lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm);
highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm);
lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4);
zmm[0] = _mm512_or_si512(lowNibblemm, highNibblemm);
zmm[0] = _mm512_shuffle_epi8(zmm[0], reverseMask32u);
_mm512_storeu_si512(vectorBuf, zmm[0]);
srcPtr += 2 * bitWidth;
decoder->resetBufferStart(2 * bitWidth, false, 0);
bufRestByteLen = decoder->bufLength();
bufMoveByteLen -= 2 * bitWidth;
numElements -= VECTOR_UNPACK_32BIT_MAX_NUM;
std::copy(vectorBuf, vectorBuf + VECTOR_UNPACK_32BIT_MAX_NUM, dstPtr);
dstPtr += VECTOR_UNPACK_32BIT_MAX_NUM;
}
}
alignTailerBoundary<true>(bitWidth, 0, startBit, bufMoveByteLen, bufRestByteLen, len,
backupByteLen, numElements, resetBuf, srcPtr, dstPtr);
}
}
void UnpackAvx512::vectorUnpack18(int64_t* data, uint64_t offset, uint64_t len) {
uint32_t bitWidth = 18;
const uint8_t* srcPtr = reinterpret_cast<const uint8_t*>(decoder->getBufStart());
uint64_t numElements = 0;
int64_t* dstPtr = data + offset;
uint64_t bufMoveByteLen = 0;
uint64_t bufRestByteLen = decoder->bufLength();
bool resetBuf = false;
uint64_t startBit = 0;
uint64_t tailBitLen = 0;
uint32_t backupByteLen = 0;
while (len > 0) {
alignHeaderBoundary<true>(bitWidth, UNPACK_32Bit_MAX_SIZE, startBit, bufMoveByteLen,
bufRestByteLen, len, tailBitLen, backupByteLen, numElements,
resetBuf, srcPtr, dstPtr);
if (numElements >= VECTOR_UNPACK_32BIT_MAX_NUM) {
__mmask16 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_DWORD(bitWidth * 16));
__m512i parseMask0 = _mm512_set1_epi32(ORC_VECTOR_BIT_MASK(bitWidth));
__m512i nibbleReversemm = _mm512_loadu_si512(nibbleReverseTable);
__m512i reverseMask32u = _mm512_loadu_si512(reverseMaskTable32u);
__m512i maskmm = _mm512_set1_epi8(0x0F);
__m512i shuffleIdxPtr = _mm512_loadu_si512(shuffleIdxTable18u_0);
__m512i permutexIdxPtr[2];
permutexIdxPtr[0] = _mm512_loadu_si512(permutexIdxTable18u_0);
permutexIdxPtr[1] = _mm512_loadu_si512(permutexIdxTable18u_1);
__m512i shiftMaskPtr[3];
shiftMaskPtr[0] = _mm512_loadu_si512(shiftTable18u_0);
shiftMaskPtr[1] = _mm512_loadu_si512(shiftTable18u_1);
shiftMaskPtr[2] = _mm512_loadu_si512(shiftTable18u_2);
__m512i gatherIdxmm = _mm512_loadu_si512(gatherIdxTable18u);
while (numElements >= 2 * VECTOR_UNPACK_32BIT_MAX_NUM) {
__m512i srcmm, zmm[2];
srcmm = _mm512_i64gather_epi64(gatherIdxmm, srcPtr, 1);
zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr);
// shifting elements so they start from the start of the word
zmm[0] = _mm512_srlv_epi32(zmm[0], shiftMaskPtr[2]);
zmm[0] = _mm512_and_si512(zmm[0], parseMask0);
_mm512_storeu_si512(vectorBuf, zmm[0]);
srcPtr += 2 * bitWidth;
decoder->resetBufferStart(2 * bitWidth, false, 0);
bufRestByteLen = decoder->bufLength();
bufMoveByteLen -= 2 * bitWidth;
numElements -= VECTOR_UNPACK_32BIT_MAX_NUM;
std::copy(vectorBuf, vectorBuf + VECTOR_UNPACK_32BIT_MAX_NUM, dstPtr);
dstPtr += VECTOR_UNPACK_32BIT_MAX_NUM;
}
if (numElements >= VECTOR_UNPACK_32BIT_MAX_NUM) {
__m512i srcmm, zmm[2];
srcmm = _mm512_maskz_loadu_epi32(readMask, srcPtr);
__m512i lowNibblemm = _mm512_and_si512(srcmm, maskmm);
__m512i highNibblemm = _mm512_srli_epi16(srcmm, 4);
highNibblemm = _mm512_and_si512(highNibblemm, maskmm);
lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm);
highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm);
lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4);
srcmm = _mm512_or_si512(lowNibblemm, highNibblemm);
// permuting so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones
zmm[0] = _mm512_permutexvar_epi32(permutexIdxPtr[0], srcmm);
zmm[1] = _mm512_permutexvar_epi32(permutexIdxPtr[1], srcmm);
// shifting elements so they start from the start of the word
zmm[0] = _mm512_srlv_epi64(zmm[0], shiftMaskPtr[0]);
zmm[1] = _mm512_sllv_epi64(zmm[1], shiftMaskPtr[1]);
// gathering even and odd elements together
zmm[0] = _mm512_mask_mov_epi32(zmm[0], 0xAAAA, zmm[1]);
zmm[0] = _mm512_and_si512(zmm[0], parseMask0);
zmm[0] = _mm512_slli_epi32(zmm[0], 14);
lowNibblemm = _mm512_and_si512(zmm[0], maskmm);
highNibblemm = _mm512_srli_epi16(zmm[0], 4);
highNibblemm = _mm512_and_si512(highNibblemm, maskmm);
lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm);
highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm);
lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4);
zmm[0] = _mm512_or_si512(lowNibblemm, highNibblemm);
zmm[0] = _mm512_shuffle_epi8(zmm[0], reverseMask32u);
_mm512_storeu_si512(vectorBuf, zmm[0]);
srcPtr += 2 * bitWidth;
decoder->resetBufferStart(2 * bitWidth, false, 0);
bufRestByteLen = decoder->bufLength();
bufMoveByteLen -= 2 * bitWidth;
numElements -= VECTOR_UNPACK_32BIT_MAX_NUM;
std::copy(vectorBuf, vectorBuf + VECTOR_UNPACK_32BIT_MAX_NUM, dstPtr);
dstPtr += VECTOR_UNPACK_32BIT_MAX_NUM;
}
}
alignTailerBoundary<true>(bitWidth, 0, startBit, bufMoveByteLen, bufRestByteLen, len,
backupByteLen, numElements, resetBuf, srcPtr, dstPtr);
}
}
void UnpackAvx512::vectorUnpack19(int64_t* data, uint64_t offset, uint64_t len) {
uint32_t bitWidth = 19;
const uint8_t* srcPtr = reinterpret_cast<const uint8_t*>(decoder->getBufStart());
uint64_t numElements = 0;
int64_t* dstPtr = data + offset;
uint64_t bufMoveByteLen = 0;
uint64_t bufRestByteLen = decoder->bufLength();
bool resetBuf = false;
uint64_t startBit = 0;
uint64_t tailBitLen = 0;
uint32_t backupByteLen = 0;
while (len > 0) {
alignHeaderBoundary<true>(bitWidth, UNPACK_32Bit_MAX_SIZE, startBit, bufMoveByteLen,
bufRestByteLen, len, tailBitLen, backupByteLen, numElements,
resetBuf, srcPtr, dstPtr);
if (numElements >= VECTOR_UNPACK_32BIT_MAX_NUM) {
__mmask32 readMask = ORC_VECTOR_BIT_MASK(bitWidth);
__m512i parseMask0 = _mm512_set1_epi32(ORC_VECTOR_BIT_MASK(bitWidth));
__m512i nibbleReversemm = _mm512_loadu_si512(nibbleReverseTable);
__m512i reverseMask32u = _mm512_loadu_si512(reverseMaskTable32u);
__m512i maskmm = _mm512_set1_epi8(0x0F);
__m512i shuffleIdxPtr = _mm512_loadu_si512(shuffleIdxTable19u_0);
__m512i permutexIdxPtr[2];
permutexIdxPtr[0] = _mm512_loadu_si512(permutexIdxTable19u_0);
permutexIdxPtr[1] = _mm512_loadu_si512(permutexIdxTable19u_1);
__m512i shiftMaskPtr[3];
shiftMaskPtr[0] = _mm512_loadu_si512(shiftTable19u_0);
shiftMaskPtr[1] = _mm512_loadu_si512(shiftTable19u_1);
shiftMaskPtr[2] = _mm512_loadu_si512(shiftTable19u_2);
__m512i gatherIdxmm = _mm512_loadu_si512(gatherIdxTable19u);
while (numElements >= 2 * VECTOR_UNPACK_32BIT_MAX_NUM) {
__m512i srcmm, zmm[2];
srcmm = _mm512_i64gather_epi64(gatherIdxmm, srcPtr, 1);
zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr);
// shifting elements so they start from the start of the word
zmm[0] = _mm512_srlv_epi32(zmm[0], shiftMaskPtr[2]);
zmm[0] = _mm512_and_si512(zmm[0], parseMask0);
_mm512_storeu_si512(vectorBuf, zmm[0]);
srcPtr += 2 * bitWidth;
decoder->resetBufferStart(2 * bitWidth, false, 0);
bufRestByteLen = decoder->bufLength();
bufMoveByteLen -= 2 * bitWidth;
numElements -= VECTOR_UNPACK_32BIT_MAX_NUM;
std::copy(vectorBuf, vectorBuf + VECTOR_UNPACK_32BIT_MAX_NUM, dstPtr);
dstPtr += VECTOR_UNPACK_32BIT_MAX_NUM;
}
if (numElements >= VECTOR_UNPACK_32BIT_MAX_NUM) {
__m512i srcmm, zmm[2];
srcmm = _mm512_maskz_loadu_epi16(readMask, srcPtr);
__m512i lowNibblemm = _mm512_and_si512(srcmm, maskmm);
__m512i highNibblemm = _mm512_srli_epi16(srcmm, 4);
highNibblemm = _mm512_and_si512(highNibblemm, maskmm);
lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm);
highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm);
lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4);
srcmm = _mm512_or_si512(lowNibblemm, highNibblemm);
// permuting so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones
zmm[0] = _mm512_permutexvar_epi32(permutexIdxPtr[0], srcmm);
zmm[1] = _mm512_permutexvar_epi32(permutexIdxPtr[1], srcmm);
// shifting elements so they start from the start of the word
zmm[0] = _mm512_srlv_epi64(zmm[0], shiftMaskPtr[0]);
zmm[1] = _mm512_sllv_epi64(zmm[1], shiftMaskPtr[1]);
// gathering even and odd elements together
zmm[0] = _mm512_mask_mov_epi32(zmm[0], 0xAAAA, zmm[1]);
zmm[0] = _mm512_and_si512(zmm[0], parseMask0);
zmm[0] = _mm512_slli_epi32(zmm[0], 13);
lowNibblemm = _mm512_and_si512(zmm[0], maskmm);
highNibblemm = _mm512_srli_epi16(zmm[0], 4);
highNibblemm = _mm512_and_si512(highNibblemm, maskmm);
lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm);
highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm);
lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4);
zmm[0] = _mm512_or_si512(lowNibblemm, highNibblemm);
zmm[0] = _mm512_shuffle_epi8(zmm[0], reverseMask32u);
_mm512_storeu_si512(vectorBuf, zmm[0]);
srcPtr += 2 * bitWidth;
decoder->resetBufferStart(2 * bitWidth, false, 0);
bufRestByteLen = decoder->bufLength();
bufMoveByteLen -= 2 * bitWidth;
numElements -= VECTOR_UNPACK_32BIT_MAX_NUM;
std::copy(vectorBuf, vectorBuf + VECTOR_UNPACK_32BIT_MAX_NUM, dstPtr);
dstPtr += VECTOR_UNPACK_32BIT_MAX_NUM;
}
}
alignTailerBoundary<true>(bitWidth, 0, startBit, bufMoveByteLen, bufRestByteLen, len,
backupByteLen, numElements, resetBuf, srcPtr, dstPtr);
}
}
void UnpackAvx512::vectorUnpack20(int64_t* data, uint64_t offset, uint64_t len) {
uint32_t bitWidth = 20;
const uint8_t* srcPtr = reinterpret_cast<const uint8_t*>(decoder->getBufStart());
uint64_t numElements = 0;
int64_t* dstPtr = data + offset;
uint64_t bufMoveByteLen = 0;
uint64_t bufRestByteLen = decoder->bufLength();
bool resetBuf = false;
uint64_t startBit = 0;
uint64_t tailBitLen = 0;
uint32_t backupByteLen = 0;
while (len > 0) {
alignHeaderBoundary<true>(bitWidth, UNPACK_32Bit_MAX_SIZE, startBit, bufMoveByteLen,
bufRestByteLen, len, tailBitLen, backupByteLen, numElements,
resetBuf, srcPtr, dstPtr);
if (numElements >= VECTOR_UNPACK_32BIT_MAX_NUM) {
__mmask16 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_DWORD(bitWidth * 16));
__m512i parseMask0 = _mm512_set1_epi32(ORC_VECTOR_BIT_MASK(bitWidth));
__m512i shuffleIdxPtr = _mm512_loadu_si512(shuffleIdxTable20u_0);
__m512i permutexIdx = _mm512_loadu_si512(permutexIdxTable20u);
__m512i shiftMask = _mm512_loadu_si512(shiftTable20u);
while (numElements >= VECTOR_UNPACK_32BIT_MAX_NUM) {
__m512i srcmm, zmm;
srcmm = _mm512_maskz_loadu_epi32(readMask, srcPtr);
zmm = _mm512_permutexvar_epi16(permutexIdx, srcmm);
zmm = _mm512_shuffle_epi8(zmm, shuffleIdxPtr);
// shifting elements so they start from the start of the word
zmm = _mm512_srlv_epi32(zmm, shiftMask);
zmm = _mm512_and_si512(zmm, parseMask0);
_mm512_storeu_si512(vectorBuf, zmm);
srcPtr += 2 * bitWidth;
decoder->resetBufferStart(2 * bitWidth, false, 0);
bufRestByteLen = decoder->bufLength();
bufMoveByteLen -= 2 * bitWidth;
numElements -= VECTOR_UNPACK_32BIT_MAX_NUM;
std::copy(vectorBuf, vectorBuf + VECTOR_UNPACK_32BIT_MAX_NUM, dstPtr);
dstPtr += VECTOR_UNPACK_32BIT_MAX_NUM;
}
}
alignTailerBoundary<true>(bitWidth, 0, startBit, bufMoveByteLen, bufRestByteLen, len,
backupByteLen, numElements, resetBuf, srcPtr, dstPtr);
}
}
void UnpackAvx512::vectorUnpack21(int64_t* data, uint64_t offset, uint64_t len) {
uint32_t bitWidth = 21;
const uint8_t* srcPtr = reinterpret_cast<const uint8_t*>(decoder->getBufStart());
uint64_t numElements = 0;
int64_t* dstPtr = data + offset;
uint64_t bufMoveByteLen = 0;
uint64_t bufRestByteLen = decoder->bufLength();
bool resetBuf = false;
uint64_t startBit = 0;
uint64_t tailBitLen = 0;
uint32_t backupByteLen = 0;
while (len > 0) {
alignHeaderBoundary<true>(bitWidth, UNPACK_32Bit_MAX_SIZE, startBit, bufMoveByteLen,
bufRestByteLen, len, tailBitLen, backupByteLen, numElements,
resetBuf, srcPtr, dstPtr);
if (numElements >= VECTOR_UNPACK_32BIT_MAX_NUM) {
__mmask32 readMask = ORC_VECTOR_BIT_MASK(bitWidth);
__m512i parseMask0 = _mm512_set1_epi32(ORC_VECTOR_BIT_MASK(bitWidth));
__m512i nibbleReversemm = _mm512_loadu_si512(nibbleReverseTable);
__m512i reverseMask32u = _mm512_loadu_si512(reverseMaskTable32u);
__m512i maskmm = _mm512_set1_epi8(0x0F);
__m512i shuffleIdxPtr = _mm512_loadu_si512(shuffleIdxTable21u_0);
__m512i permutexIdxPtr[2];
permutexIdxPtr[0] = _mm512_loadu_si512(permutexIdxTable21u_0);
permutexIdxPtr[1] = _mm512_loadu_si512(permutexIdxTable21u_1);
__m512i shiftMaskPtr[3];
shiftMaskPtr[0] = _mm512_loadu_si512(shiftTable21u_0);
shiftMaskPtr[1] = _mm512_loadu_si512(shiftTable21u_1);
shiftMaskPtr[2] = _mm512_loadu_si512(shiftTable21u_2);
__m512i gatherIdxmm = _mm512_loadu_si512(gatherIdxTable21u);
while (numElements >= 2 * VECTOR_UNPACK_32BIT_MAX_NUM) {
__m512i srcmm, zmm[2];
srcmm = _mm512_i64gather_epi64(gatherIdxmm, srcPtr, 1);
zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr);
// shifting elements so they start from the start of the word
zmm[0] = _mm512_srlv_epi32(zmm[0], shiftMaskPtr[2]);
zmm[0] = _mm512_and_si512(zmm[0], parseMask0);
_mm512_storeu_si512(vectorBuf, zmm[0]);
srcPtr += 2 * bitWidth;
decoder->resetBufferStart(2 * bitWidth, false, 0);
bufRestByteLen = decoder->bufLength();
bufMoveByteLen -= 2 * bitWidth;
numElements -= VECTOR_UNPACK_32BIT_MAX_NUM;
std::copy(vectorBuf, vectorBuf + VECTOR_UNPACK_32BIT_MAX_NUM, dstPtr);
dstPtr += VECTOR_UNPACK_32BIT_MAX_NUM;
}
if (numElements >= VECTOR_UNPACK_32BIT_MAX_NUM) {
__m512i srcmm, zmm[2];
srcmm = _mm512_maskz_loadu_epi16(readMask, srcPtr);
__m512i lowNibblemm = _mm512_and_si512(srcmm, maskmm);
__m512i highNibblemm = _mm512_srli_epi16(srcmm, 4);
highNibblemm = _mm512_and_si512(highNibblemm, maskmm);
lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm);
highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm);
lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4);
srcmm = _mm512_or_si512(lowNibblemm, highNibblemm);
// permuting so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones
zmm[0] = _mm512_permutexvar_epi32(permutexIdxPtr[0], srcmm);
zmm[1] = _mm512_permutexvar_epi32(permutexIdxPtr[1], srcmm);
// shifting elements so they start from the start of the word
zmm[0] = _mm512_srlv_epi64(zmm[0], shiftMaskPtr[0]);
zmm[1] = _mm512_sllv_epi64(zmm[1], shiftMaskPtr[1]);
// gathering even and odd elements together
zmm[0] = _mm512_mask_mov_epi32(zmm[0], 0xAAAA, zmm[1]);
zmm[0] = _mm512_and_si512(zmm[0], parseMask0);
zmm[0] = _mm512_slli_epi32(zmm[0], 11);
lowNibblemm = _mm512_and_si512(zmm[0], maskmm);
highNibblemm = _mm512_srli_epi16(zmm[0], 4);
highNibblemm = _mm512_and_si512(highNibblemm, maskmm);
lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm);
highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm);
lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4);
zmm[0] = _mm512_or_si512(lowNibblemm, highNibblemm);
zmm[0] = _mm512_shuffle_epi8(zmm[0], reverseMask32u);
_mm512_storeu_si512(vectorBuf, zmm[0]);
srcPtr += 2 * bitWidth;
decoder->resetBufferStart(2 * bitWidth, false, 0);
bufRestByteLen = decoder->bufLength();
bufMoveByteLen -= 2 * bitWidth;
numElements -= VECTOR_UNPACK_32BIT_MAX_NUM;
std::copy(vectorBuf, vectorBuf + VECTOR_UNPACK_32BIT_MAX_NUM, dstPtr);
dstPtr += VECTOR_UNPACK_32BIT_MAX_NUM;
}
}
alignTailerBoundary<true>(bitWidth, 0, startBit, bufMoveByteLen, bufRestByteLen, len,
backupByteLen, numElements, resetBuf, srcPtr, dstPtr);
}
}
void UnpackAvx512::vectorUnpack22(int64_t* data, uint64_t offset, uint64_t len) {
uint32_t bitWidth = 22;
const uint8_t* srcPtr = reinterpret_cast<const uint8_t*>(decoder->getBufStart());
uint64_t numElements = 0;
int64_t* dstPtr = data + offset;
uint64_t bufMoveByteLen = 0;
uint64_t bufRestByteLen = decoder->bufLength();
bool resetBuf = false;
uint64_t startBit = 0;
uint64_t tailBitLen = 0;
uint32_t backupByteLen = 0;
while (len > 0) {
alignHeaderBoundary<true>(bitWidth, UNPACK_32Bit_MAX_SIZE, startBit, bufMoveByteLen,
bufRestByteLen, len, tailBitLen, backupByteLen, numElements,
resetBuf, srcPtr, dstPtr);
if (numElements >= VECTOR_UNPACK_32BIT_MAX_NUM) {
__mmask16 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_DWORD(bitWidth * 16));
__m512i parseMask0 = _mm512_set1_epi32(ORC_VECTOR_BIT_MASK(bitWidth));
__m512i nibbleReversemm = _mm512_loadu_si512(nibbleReverseTable);
__m512i reverseMask32u = _mm512_loadu_si512(reverseMaskTable32u);
__m512i maskmm = _mm512_set1_epi8(0x0F);
__m512i shuffleIdxPtr = _mm512_loadu_si512(shuffleIdxTable22u_0);
__m512i permutexIdxPtr[2];
permutexIdxPtr[0] = _mm512_loadu_si512(permutexIdxTable22u_0);
permutexIdxPtr[1] = _mm512_loadu_si512(permutexIdxTable22u_1);
__m512i shiftMaskPtr[3];
shiftMaskPtr[0] = _mm512_loadu_si512(shiftTable22u_0);
shiftMaskPtr[1] = _mm512_loadu_si512(shiftTable22u_1);
shiftMaskPtr[2] = _mm512_loadu_si512(shiftTable22u_2);
__m512i gatherIdxmm = _mm512_loadu_si512(gatherIdxTable22u);
while (numElements >= 2 * VECTOR_UNPACK_32BIT_MAX_NUM) {
__m512i srcmm, zmm[2];
srcmm = _mm512_i64gather_epi64(gatherIdxmm, srcPtr, 1);
zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr);
// shifting elements so they start from the start of the word
zmm[0] = _mm512_srlv_epi32(zmm[0], shiftMaskPtr[2]);
zmm[0] = _mm512_and_si512(zmm[0], parseMask0);
_mm512_storeu_si512(vectorBuf, zmm[0]);
srcPtr += 2 * bitWidth;
decoder->resetBufferStart(2 * bitWidth, false, 0);
bufRestByteLen = decoder->bufLength();
bufMoveByteLen -= 2 * bitWidth;
numElements -= VECTOR_UNPACK_32BIT_MAX_NUM;
std::copy(vectorBuf, vectorBuf + VECTOR_UNPACK_32BIT_MAX_NUM, dstPtr);
dstPtr += VECTOR_UNPACK_32BIT_MAX_NUM;
}
if (numElements >= VECTOR_UNPACK_32BIT_MAX_NUM) {
__m512i srcmm, zmm[2];
srcmm = _mm512_maskz_loadu_epi32(readMask, srcPtr);
__m512i lowNibblemm = _mm512_and_si512(srcmm, maskmm);
__m512i highNibblemm = _mm512_srli_epi16(srcmm, 4);
highNibblemm = _mm512_and_si512(highNibblemm, maskmm);
lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm);
highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm);
lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4);
srcmm = _mm512_or_si512(lowNibblemm, highNibblemm);
// permuting so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones
zmm[0] = _mm512_permutexvar_epi32(permutexIdxPtr[0], srcmm);
zmm[1] = _mm512_permutexvar_epi32(permutexIdxPtr[1], srcmm);
// shifting elements so they start from the start of the word
zmm[0] = _mm512_srlv_epi64(zmm[0], shiftMaskPtr[0]);
zmm[1] = _mm512_sllv_epi64(zmm[1], shiftMaskPtr[1]);
// gathering even and odd elements together
zmm[0] = _mm512_mask_mov_epi32(zmm[0], 0xAAAA, zmm[1]);
zmm[0] = _mm512_and_si512(zmm[0], parseMask0);
zmm[0] = _mm512_slli_epi32(zmm[0], 10);
lowNibblemm = _mm512_and_si512(zmm[0], maskmm);
highNibblemm = _mm512_srli_epi16(zmm[0], 4);
highNibblemm = _mm512_and_si512(highNibblemm, maskmm);
lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm);
highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm);
lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4);
zmm[0] = _mm512_or_si512(lowNibblemm, highNibblemm);
zmm[0] = _mm512_shuffle_epi8(zmm[0], reverseMask32u);
_mm512_storeu_si512(vectorBuf, zmm[0]);
srcPtr += 2 * bitWidth;
decoder->resetBufferStart(2 * bitWidth, false, 0);
bufRestByteLen = decoder->bufLength();
bufMoveByteLen -= 2 * bitWidth;
numElements -= VECTOR_UNPACK_32BIT_MAX_NUM;
std::copy(vectorBuf, vectorBuf + VECTOR_UNPACK_32BIT_MAX_NUM, dstPtr);
dstPtr += VECTOR_UNPACK_32BIT_MAX_NUM;
}
}
alignTailerBoundary<true>(bitWidth, 0, startBit, bufMoveByteLen, bufRestByteLen, len,
backupByteLen, numElements, resetBuf, srcPtr, dstPtr);
}
}
void UnpackAvx512::vectorUnpack23(int64_t* data, uint64_t offset, uint64_t len) {
uint32_t bitWidth = 23;
const uint8_t* srcPtr = reinterpret_cast<const uint8_t*>(decoder->getBufStart());
uint64_t numElements = 0;
int64_t* dstPtr = data + offset;
uint64_t bufMoveByteLen = 0;
uint64_t bufRestByteLen = decoder->bufLength();
bool resetBuf = false;
uint64_t startBit = 0;
uint64_t tailBitLen = 0;
uint32_t backupByteLen = 0;
while (len > 0) {
alignHeaderBoundary<true>(bitWidth, UNPACK_32Bit_MAX_SIZE, startBit, bufMoveByteLen,
bufRestByteLen, len, tailBitLen, backupByteLen, numElements,
resetBuf, srcPtr, dstPtr);
if (numElements >= VECTOR_UNPACK_32BIT_MAX_NUM) {
__mmask32 readMask = ORC_VECTOR_BIT_MASK(bitWidth);
__m512i parseMask0 = _mm512_set1_epi32(ORC_VECTOR_BIT_MASK(bitWidth));
__m512i nibbleReversemm = _mm512_loadu_si512(nibbleReverseTable);
__m512i reverseMask32u = _mm512_loadu_si512(reverseMaskTable32u);
__m512i maskmm = _mm512_set1_epi8(0x0F);
__m512i shuffleIdxPtr = _mm512_loadu_si512(shuffleIdxTable23u_0);
__m512i permutexIdxPtr[2];
permutexIdxPtr[0] = _mm512_loadu_si512(permutexIdxTable23u_0);
permutexIdxPtr[1] = _mm512_loadu_si512(permutexIdxTable23u_1);
__m512i shiftMaskPtr[3];
shiftMaskPtr[0] = _mm512_loadu_si512(shiftTable23u_0);
shiftMaskPtr[1] = _mm512_loadu_si512(shiftTable23u_1);
shiftMaskPtr[2] = _mm512_loadu_si512(shiftTable23u_2);
__m512i gatherIdxmm = _mm512_loadu_si512(gatherIdxTable23u);
while (numElements >= 2 * VECTOR_UNPACK_32BIT_MAX_NUM) {
__m512i srcmm, zmm[2];
srcmm = _mm512_i64gather_epi64(gatherIdxmm, srcPtr, 1);
zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr);
// shifting elements so they start from the start of the word
zmm[0] = _mm512_srlv_epi32(zmm[0], shiftMaskPtr[2]);
zmm[0] = _mm512_and_si512(zmm[0], parseMask0);
_mm512_storeu_si512(vectorBuf, zmm[0]);
srcPtr += 2 * bitWidth;
decoder->resetBufferStart(2 * bitWidth, false, 0);
bufRestByteLen = decoder->bufLength();
bufMoveByteLen -= 2 * bitWidth;
numElements -= VECTOR_UNPACK_32BIT_MAX_NUM;
std::copy(vectorBuf, vectorBuf + VECTOR_UNPACK_32BIT_MAX_NUM, dstPtr);
dstPtr += VECTOR_UNPACK_32BIT_MAX_NUM;
}
if (numElements >= VECTOR_UNPACK_32BIT_MAX_NUM) {
__m512i srcmm, zmm[2];
srcmm = _mm512_maskz_loadu_epi16(readMask, srcPtr);
__m512i lowNibblemm = _mm512_and_si512(srcmm, maskmm);
__m512i highNibblemm = _mm512_srli_epi16(srcmm, 4);
highNibblemm = _mm512_and_si512(highNibblemm, maskmm);
lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm);
highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm);
lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4);
srcmm = _mm512_or_si512(lowNibblemm, highNibblemm);
// permuting so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones
zmm[0] = _mm512_permutexvar_epi32(permutexIdxPtr[0], srcmm);
zmm[1] = _mm512_permutexvar_epi32(permutexIdxPtr[1], srcmm);
// shifting elements so they start from the start of the word
zmm[0] = _mm512_srlv_epi64(zmm[0], shiftMaskPtr[0]);
zmm[1] = _mm512_sllv_epi64(zmm[1], shiftMaskPtr[1]);
// gathering even and odd elements together
zmm[0] = _mm512_mask_mov_epi32(zmm[0], 0xAAAA, zmm[1]);
zmm[0] = _mm512_and_si512(zmm[0], parseMask0);
zmm[0] = _mm512_slli_epi32(zmm[0], 9);
lowNibblemm = _mm512_and_si512(zmm[0], maskmm);
highNibblemm = _mm512_srli_epi16(zmm[0], 4);
highNibblemm = _mm512_and_si512(highNibblemm, maskmm);
lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm);
highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm);
lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4);
zmm[0] = _mm512_or_si512(lowNibblemm, highNibblemm);
zmm[0] = _mm512_shuffle_epi8(zmm[0], reverseMask32u);
_mm512_storeu_si512(vectorBuf, zmm[0]);
srcPtr += 2 * bitWidth;
decoder->resetBufferStart(2 * bitWidth, false, 0);
bufRestByteLen = decoder->bufLength();
bufMoveByteLen -= 2 * bitWidth;
numElements -= VECTOR_UNPACK_32BIT_MAX_NUM;
std::copy(vectorBuf, vectorBuf + VECTOR_UNPACK_32BIT_MAX_NUM, dstPtr);
dstPtr += VECTOR_UNPACK_32BIT_MAX_NUM;
}
}
alignTailerBoundary<true>(bitWidth, 0, startBit, bufMoveByteLen, bufRestByteLen, len,
backupByteLen, numElements, resetBuf, srcPtr, dstPtr);
}
}
void UnpackAvx512::vectorUnpack24(int64_t* data, uint64_t offset, uint64_t len) {
uint32_t bitWidth = 24;
const uint8_t* srcPtr = reinterpret_cast<const uint8_t*>(decoder->getBufStart());
uint64_t numElements = 0;
int64_t* dstPtr = data + offset;
uint64_t bufMoveByteLen = 0;
uint64_t bufRestByteLen = decoder->bufLength();
bool resetBuf = false;
uint64_t tailBitLen = 0;
uint32_t backupByteLen = 0;
uint64_t startBit = 0;
while (len > 0) {
alignHeaderBoundary<false>(bitWidth, UNPACK_32Bit_MAX_SIZE, startBit, bufMoveByteLen,
bufRestByteLen, len, tailBitLen, backupByteLen, numElements,
resetBuf, srcPtr, dstPtr);
if (numElements >= VECTOR_UNPACK_32BIT_MAX_NUM) {
__mmask16 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_DWORD(bitWidth * 16));
__m512i shuffleIdx = _mm512_loadu_si512(shuffleIdxTable24u_0);
__m512i permutexIdx = _mm512_loadu_si512(permutexIdxTable24u);
while (numElements >= VECTOR_UNPACK_32BIT_MAX_NUM) {
__m512i srcmm, zmm;
srcmm = _mm512_maskz_loadu_epi32(readMask, srcPtr);
zmm = _mm512_permutexvar_epi32(permutexIdx, srcmm);
zmm = _mm512_shuffle_epi8(zmm, shuffleIdx);
_mm512_storeu_si512(vectorBuf, zmm);
srcPtr += 2 * bitWidth;
decoder->resetBufferStart(2 * bitWidth, false, 0);
bufRestByteLen = decoder->bufLength();
bufMoveByteLen -= 2 * bitWidth;
numElements -= VECTOR_UNPACK_32BIT_MAX_NUM;
std::copy(vectorBuf, vectorBuf + VECTOR_UNPACK_32BIT_MAX_NUM, dstPtr);
dstPtr += VECTOR_UNPACK_32BIT_MAX_NUM;
}
}
alignTailerBoundary<false>(bitWidth, 24, startBit, bufMoveByteLen, bufRestByteLen, len,
backupByteLen, numElements, resetBuf, srcPtr, dstPtr);
}
}
void UnpackAvx512::vectorUnpack26(int64_t* data, uint64_t offset, uint64_t len) {
uint32_t bitWidth = 26;
const uint8_t* srcPtr = reinterpret_cast<const uint8_t*>(decoder->getBufStart());
uint64_t numElements = 0;
int64_t* dstPtr = data + offset;
uint64_t bufMoveByteLen = 0;
uint64_t bufRestByteLen = decoder->bufLength();
bool resetBuf = false;
uint64_t startBit = 0;
uint64_t tailBitLen = 0;
uint32_t backupByteLen = 0;
while (len > 0) {
alignHeaderBoundary<true>(bitWidth, UNPACK_32Bit_MAX_SIZE, startBit, bufMoveByteLen,
bufRestByteLen, len, tailBitLen, backupByteLen, numElements,
resetBuf, srcPtr, dstPtr);
if (numElements >= VECTOR_UNPACK_32BIT_MAX_NUM) {
__mmask16 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_DWORD(bitWidth * 16));
__m512i parseMask0 = _mm512_set1_epi32(ORC_VECTOR_BIT_MASK(bitWidth));
__m512i nibbleReversemm = _mm512_loadu_si512(nibbleReverseTable);
__m512i reverseMask32u = _mm512_loadu_si512(reverseMaskTable32u);
__m512i maskmm = _mm512_set1_epi8(0x0F);
__m512i shuffleIdxPtr = _mm512_loadu_si512(shuffleIdxTable26u_0);
__m512i permutexIdxPtr[2];
permutexIdxPtr[0] = _mm512_loadu_si512(permutexIdxTable26u_0);
permutexIdxPtr[1] = _mm512_loadu_si512(permutexIdxTable26u_1);
__m512i shiftMaskPtr[3];
shiftMaskPtr[0] = _mm512_loadu_si512(shiftTable26u_0);
shiftMaskPtr[1] = _mm512_loadu_si512(shiftTable26u_1);
shiftMaskPtr[2] = _mm512_loadu_si512(shiftTable26u_2);
__m512i gatherIdxmm = _mm512_loadu_si512(gatherIdxTable26u);
while (numElements >= 2 * VECTOR_UNPACK_32BIT_MAX_NUM) {
__m512i srcmm, zmm[2];
srcmm = _mm512_i64gather_epi64(gatherIdxmm, srcPtr, 1);
zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr);
// shifting elements so they start from the start of the word
zmm[0] = _mm512_srlv_epi32(zmm[0], shiftMaskPtr[2]);
zmm[0] = _mm512_and_si512(zmm[0], parseMask0);
_mm512_storeu_si512(vectorBuf, zmm[0]);
srcPtr += 2 * bitWidth;
decoder->resetBufferStart(2 * bitWidth, false, 0);
bufRestByteLen = decoder->bufLength();
bufMoveByteLen -= 2 * bitWidth;
numElements -= VECTOR_UNPACK_32BIT_MAX_NUM;
std::copy(vectorBuf, vectorBuf + VECTOR_UNPACK_32BIT_MAX_NUM, dstPtr);
dstPtr += VECTOR_UNPACK_32BIT_MAX_NUM;
}
if (numElements >= VECTOR_UNPACK_32BIT_MAX_NUM) {
__m512i srcmm, zmm[2];
srcmm = _mm512_maskz_loadu_epi32(readMask, srcPtr);
__m512i lowNibblemm = _mm512_and_si512(srcmm, maskmm);
__m512i highNibblemm = _mm512_srli_epi16(srcmm, 4);
highNibblemm = _mm512_and_si512(highNibblemm, maskmm);
lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm);
highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm);
lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4);
srcmm = _mm512_or_si512(lowNibblemm, highNibblemm);
// permuting so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones
zmm[0] = _mm512_permutexvar_epi32(permutexIdxPtr[0], srcmm);
zmm[1] = _mm512_permutexvar_epi32(permutexIdxPtr[1], srcmm);
// shifting elements so they start from the start of the word
zmm[0] = _mm512_srlv_epi64(zmm[0], shiftMaskPtr[0]);
zmm[1] = _mm512_sllv_epi64(zmm[1], shiftMaskPtr[1]);
// gathering even and odd elements together
zmm[0] = _mm512_mask_mov_epi32(zmm[0], 0xAAAA, zmm[1]);
zmm[0] = _mm512_and_si512(zmm[0], parseMask0);
zmm[0] = _mm512_slli_epi32(zmm[0], 6);
lowNibblemm = _mm512_and_si512(zmm[0], maskmm);
highNibblemm = _mm512_srli_epi16(zmm[0], 4);
highNibblemm = _mm512_and_si512(highNibblemm, maskmm);
lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm);
highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm);
lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4);
zmm[0] = _mm512_or_si512(lowNibblemm, highNibblemm);
zmm[0] = _mm512_shuffle_epi8(zmm[0], reverseMask32u);
_mm512_storeu_si512(vectorBuf, zmm[0]);
srcPtr += 2 * bitWidth;
decoder->resetBufferStart(2 * bitWidth, false, 0);
bufRestByteLen = decoder->bufLength();
bufMoveByteLen -= 2 * bitWidth;
numElements -= VECTOR_UNPACK_32BIT_MAX_NUM;
std::copy(vectorBuf, vectorBuf + VECTOR_UNPACK_32BIT_MAX_NUM, dstPtr);
dstPtr += VECTOR_UNPACK_32BIT_MAX_NUM;
}
}
alignTailerBoundary<true>(bitWidth, 0, startBit, bufMoveByteLen, bufRestByteLen, len,
backupByteLen, numElements, resetBuf, srcPtr, dstPtr);
}
}
void UnpackAvx512::vectorUnpack28(int64_t* data, uint64_t offset, uint64_t len) {
uint32_t bitWidth = 28;
const uint8_t* srcPtr = reinterpret_cast<const uint8_t*>(decoder->getBufStart());
uint64_t numElements = 0;
int64_t* dstPtr = data + offset;
uint64_t bufMoveByteLen = 0;
uint64_t bufRestByteLen = decoder->bufLength();
bool resetBuf = false;
uint64_t startBit = 0;
uint64_t tailBitLen = 0;
uint32_t backupByteLen = 0;
while (len > 0) {
alignHeaderBoundary<true>(bitWidth, UNPACK_32Bit_MAX_SIZE, startBit, bufMoveByteLen,
bufRestByteLen, len, tailBitLen, backupByteLen, numElements,
resetBuf, srcPtr, dstPtr);
if (numElements >= VECTOR_UNPACK_32BIT_MAX_NUM) {
__mmask16 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_DWORD(bitWidth * 16));
__m512i parseMask0 = _mm512_set1_epi32(ORC_VECTOR_BIT_MASK(bitWidth));
__m512i shuffleIdxPtr = _mm512_loadu_si512(shuffleIdxTable28u_0);
__m512i permutexIdx = _mm512_loadu_si512(permutexIdxTable28u);
__m512i shiftMask = _mm512_loadu_si512(shiftTable28u);
while (numElements >= VECTOR_UNPACK_32BIT_MAX_NUM) {
__m512i srcmm, zmm;
srcmm = _mm512_maskz_loadu_epi32(readMask, srcPtr);
zmm = _mm512_permutexvar_epi16(permutexIdx, srcmm);
zmm = _mm512_shuffle_epi8(zmm, shuffleIdxPtr);
// shifting elements so they start from the start of the word
zmm = _mm512_srlv_epi32(zmm, shiftMask);
zmm = _mm512_and_si512(zmm, parseMask0);
_mm512_storeu_si512(vectorBuf, zmm);
srcPtr += 2 * bitWidth;
decoder->resetBufferStart(2 * bitWidth, false, 0);
bufRestByteLen = decoder->bufLength();
bufMoveByteLen -= 2 * bitWidth;
numElements -= VECTOR_UNPACK_32BIT_MAX_NUM;
std::copy(vectorBuf, vectorBuf + VECTOR_UNPACK_32BIT_MAX_NUM, dstPtr);
dstPtr += VECTOR_UNPACK_32BIT_MAX_NUM;
}
}
alignTailerBoundary<true>(bitWidth, 0, startBit, bufMoveByteLen, bufRestByteLen, len,
backupByteLen, numElements, resetBuf, srcPtr, dstPtr);
}
}
void UnpackAvx512::vectorUnpack30(int64_t* data, uint64_t offset, uint64_t len) {
uint32_t bitWidth = 30;
const uint8_t* srcPtr = reinterpret_cast<const uint8_t*>(decoder->getBufStart());
uint64_t numElements = 0;
int64_t* dstPtr = data + offset;
uint64_t bufMoveByteLen = 0;
uint64_t bufRestByteLen = decoder->bufLength();
bool resetBuf = false;
uint64_t startBit = 0;
uint64_t tailBitLen = 0;
uint32_t backupByteLen = 0;
while (len > 0) {
alignHeaderBoundary<true>(bitWidth, UNPACK_32Bit_MAX_SIZE, startBit, bufMoveByteLen,
bufRestByteLen, len, tailBitLen, backupByteLen, numElements,
resetBuf, srcPtr, dstPtr);
if (numElements >= VECTOR_UNPACK_32BIT_MAX_NUM) {
__mmask16 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_DWORD(bitWidth * 16));
__m512i parseMask0 = _mm512_set1_epi32(ORC_VECTOR_BIT_MASK(bitWidth));
__m512i nibbleReversemm = _mm512_loadu_si512(nibbleReverseTable);
__m512i reverseMask32u = _mm512_loadu_si512(reverseMaskTable32u);
__m512i maskmm = _mm512_set1_epi8(0x0F);
__m512i shuffleIdxPtr[2];
shuffleIdxPtr[0] = _mm512_loadu_si512(shuffleIdxTable30u_0);
shuffleIdxPtr[1] = _mm512_loadu_si512(shuffleIdxTable30u_1);
__m512i permutexIdxPtr[2];
permutexIdxPtr[0] = _mm512_loadu_si512(permutexIdxTable30u_0);
permutexIdxPtr[1] = _mm512_loadu_si512(permutexIdxTable30u_1);
__m512i shiftMaskPtr[4];
shiftMaskPtr[0] = _mm512_loadu_si512(shiftTable30u_0);
shiftMaskPtr[1] = _mm512_loadu_si512(shiftTable30u_1);
shiftMaskPtr[2] = _mm512_loadu_si512(shiftTable30u_2);
shiftMaskPtr[3] = _mm512_loadu_si512(shiftTable30u_3);
__m512i gatherIdxmm = _mm512_loadu_si512(gatherIdxTable30u);
while (numElements >= 2 * VECTOR_UNPACK_32BIT_MAX_NUM) {
__m512i srcmm, zmm[2];
srcmm = _mm512_i64gather_epi64(gatherIdxmm, srcPtr, 1u);
// shuffling so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones
zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[0]);
zmm[1] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[1]);
// shifting elements so they start from the start of the word
zmm[0] = _mm512_srlv_epi64(zmm[0], shiftMaskPtr[2]);
zmm[1] = _mm512_sllv_epi64(zmm[1], shiftMaskPtr[3]);
// gathering even and odd elements together
zmm[0] = _mm512_mask_mov_epi32(zmm[0], 0xAAAA, zmm[1]);
zmm[0] = _mm512_and_si512(zmm[0], parseMask0);
_mm512_storeu_si512(vectorBuf, zmm[0]);
srcPtr += 2 * bitWidth;
decoder->resetBufferStart(2 * bitWidth, false, 0);
bufRestByteLen = decoder->bufLength();
bufMoveByteLen -= 2 * bitWidth;
numElements -= VECTOR_UNPACK_32BIT_MAX_NUM;
std::copy(vectorBuf, vectorBuf + VECTOR_UNPACK_32BIT_MAX_NUM, dstPtr);
dstPtr += VECTOR_UNPACK_32BIT_MAX_NUM;
}
if (numElements >= VECTOR_UNPACK_32BIT_MAX_NUM) {
__m512i srcmm, zmm[2];
srcmm = _mm512_maskz_loadu_epi32(readMask, srcPtr);
__m512i lowNibblemm = _mm512_and_si512(srcmm, maskmm);
__m512i highNibblemm = _mm512_srli_epi16(srcmm, 4);
highNibblemm = _mm512_and_si512(highNibblemm, maskmm);
lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm);
highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm);
lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4u);
srcmm = _mm512_or_si512(lowNibblemm, highNibblemm);
// permuting so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones
zmm[0] = _mm512_permutexvar_epi32(permutexIdxPtr[0], srcmm);
zmm[1] = _mm512_permutexvar_epi32(permutexIdxPtr[1], srcmm);
// shifting elements so they start from the start of the word
zmm[0] = _mm512_srlv_epi64(zmm[0], shiftMaskPtr[0]);
zmm[1] = _mm512_sllv_epi64(zmm[1], shiftMaskPtr[1]);
// gathering even and odd elements together
zmm[0] = _mm512_mask_mov_epi32(zmm[0], 0xAAAA, zmm[1]);
zmm[0] = _mm512_and_si512(zmm[0], parseMask0);
zmm[0] = _mm512_slli_epi32(zmm[0], 2u);
lowNibblemm = _mm512_and_si512(zmm[0], maskmm);
highNibblemm = _mm512_srli_epi16(zmm[0], 4u);
highNibblemm = _mm512_and_si512(highNibblemm, maskmm);
lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm);
highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm);
lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4u);
zmm[0] = _mm512_or_si512(lowNibblemm, highNibblemm);
zmm[0] = _mm512_shuffle_epi8(zmm[0], reverseMask32u);
_mm512_storeu_si512(vectorBuf, zmm[0]);
srcPtr += 2 * bitWidth;
decoder->resetBufferStart(2 * bitWidth, false, 0);
bufRestByteLen = decoder->bufLength();
bufMoveByteLen -= 2 * bitWidth;
numElements -= VECTOR_UNPACK_32BIT_MAX_NUM;
std::copy(vectorBuf, vectorBuf + VECTOR_UNPACK_32BIT_MAX_NUM, dstPtr);
dstPtr += VECTOR_UNPACK_32BIT_MAX_NUM;
}
}
alignTailerBoundary<true>(bitWidth, 0, startBit, bufMoveByteLen, bufRestByteLen, len,
backupByteLen, numElements, resetBuf, srcPtr, dstPtr);
}
}
void UnpackAvx512::vectorUnpack32(int64_t* data, uint64_t offset, uint64_t len) {
uint32_t bitWidth = 32;
const uint8_t* srcPtr = reinterpret_cast<const uint8_t*>(decoder->getBufStart());
uint64_t numElements = 0;
int64_t* dstPtr = data + offset;
uint64_t bufMoveByteLen = 0;
uint64_t bufRestByteLen = decoder->bufLength();
bool resetBuf = false;
uint64_t tailBitLen = 0;
uint32_t backupByteLen = 0;
uint64_t startBit = 0;
while (len > 0) {
alignHeaderBoundary<false>(bitWidth, UNPACK_32Bit_MAX_SIZE, startBit, bufMoveByteLen,
bufRestByteLen, len, tailBitLen, backupByteLen, numElements,
resetBuf, srcPtr, dstPtr);
if (numElements >= VECTOR_UNPACK_32BIT_MAX_NUM) {
__m512i reverseMask32u = _mm512_loadu_si512(reverseMaskTable32u);
while (numElements >= VECTOR_UNPACK_32BIT_MAX_NUM) {
__m512i srcmm = _mm512_loadu_si512(srcPtr);
srcmm = _mm512_shuffle_epi8(srcmm, reverseMask32u);
_mm512_storeu_si512(vectorBuf, srcmm);
srcPtr += 2 * bitWidth;
decoder->resetBufferStart(2 * bitWidth, false, 0);
bufRestByteLen = decoder->bufLength();
bufMoveByteLen -= 2 * bitWidth;
numElements -= VECTOR_UNPACK_32BIT_MAX_NUM;
std::copy(vectorBuf, vectorBuf + VECTOR_UNPACK_32BIT_MAX_NUM, dstPtr);
dstPtr += VECTOR_UNPACK_32BIT_MAX_NUM;
}
}
alignTailerBoundary<false>(bitWidth, 32, startBit, bufMoveByteLen, bufRestByteLen, len,
backupByteLen, numElements, resetBuf, srcPtr, dstPtr);
}
}
void UnpackAvx512::plainUnpackLongs(int64_t* data, uint64_t offset, uint64_t len, uint64_t fbs,
uint64_t& startBit) {
for (uint64_t i = offset; i < (offset + len); i++) {
uint64_t result = 0;
uint64_t bitsLeftToRead = fbs;
while (bitsLeftToRead > decoder->getBitsLeft()) {
result <<= decoder->getBitsLeft();
result |= decoder->getCurByte() & ((1 << decoder->getBitsLeft()) - 1);
bitsLeftToRead -= decoder->getBitsLeft();
decoder->setCurByte(decoder->readByte());
decoder->setBitsLeft(8);
}
// handle the left over bits
if (bitsLeftToRead > 0) {
result <<= bitsLeftToRead;
decoder->setBitsLeft(decoder->getBitsLeft() - static_cast<uint32_t>(bitsLeftToRead));
result |= (decoder->getCurByte() >> decoder->getBitsLeft()) & ((1 << bitsLeftToRead) - 1);
}
data[i] = static_cast<int64_t>(result);
startBit = decoder->getBitsLeft() == 0 ? 0 : (8 - decoder->getBitsLeft());
}
}
void BitUnpackAVX512::readLongs(RleDecoderV2* decoder, int64_t* data, uint64_t offset,
uint64_t len, uint64_t fbs) {
UnpackAvx512 unpackAvx512(decoder);
UnpackDefault unpackDefault(decoder);
uint64_t startBit = 0;
static const auto cpu_info = CpuInfo::getInstance();
if (cpu_info->isSupported(CpuInfo::AVX512)) {
switch (fbs) {
case 1:
unpackAvx512.vectorUnpack1(data, offset, len);
break;
case 2:
unpackAvx512.vectorUnpack2(data, offset, len);
break;
case 3:
unpackAvx512.vectorUnpack3(data, offset, len);
break;
case 4:
unpackAvx512.vectorUnpack4(data, offset, len);
break;
case 5:
unpackAvx512.vectorUnpack5(data, offset, len);
break;
case 6:
unpackAvx512.vectorUnpack6(data, offset, len);
break;
case 7:
unpackAvx512.vectorUnpack7(data, offset, len);
break;
case 8:
unpackDefault.unrolledUnpack8(data, offset, len);
break;
case 9:
unpackAvx512.vectorUnpack9(data, offset, len);
break;
case 10:
unpackAvx512.vectorUnpack10(data, offset, len);
break;
case 11:
unpackAvx512.vectorUnpack11(data, offset, len);
break;
case 12:
unpackAvx512.vectorUnpack12(data, offset, len);
break;
case 13:
unpackAvx512.vectorUnpack13(data, offset, len);
break;
case 14:
unpackAvx512.vectorUnpack14(data, offset, len);
break;
case 15:
unpackAvx512.vectorUnpack15(data, offset, len);
break;
case 16:
unpackAvx512.vectorUnpack16(data, offset, len);
break;
case 17:
unpackAvx512.vectorUnpack17(data, offset, len);
break;
case 18:
unpackAvx512.vectorUnpack18(data, offset, len);
break;
case 19:
unpackAvx512.vectorUnpack19(data, offset, len);
break;
case 20:
unpackAvx512.vectorUnpack20(data, offset, len);
break;
case 21:
unpackAvx512.vectorUnpack21(data, offset, len);
break;
case 22:
unpackAvx512.vectorUnpack22(data, offset, len);
break;
case 23:
unpackAvx512.vectorUnpack23(data, offset, len);
break;
case 24:
unpackAvx512.vectorUnpack24(data, offset, len);
break;
case 26:
unpackAvx512.vectorUnpack26(data, offset, len);
break;
case 28:
unpackAvx512.vectorUnpack28(data, offset, len);
break;
case 30:
unpackAvx512.vectorUnpack30(data, offset, len);
break;
case 32:
unpackAvx512.vectorUnpack32(data, offset, len);
break;
case 40:
unpackDefault.unrolledUnpack40(data, offset, len);
break;
case 48:
unpackDefault.unrolledUnpack48(data, offset, len);
break;
case 56:
unpackDefault.unrolledUnpack56(data, offset, len);
break;
case 64:
unpackDefault.unrolledUnpack64(data, offset, len);
break;
default:
// Fallback to the default implementation for deprecated bit size.
unpackAvx512.plainUnpackLongs(data, offset, len, fbs, startBit);
break;
}
} else {
switch (fbs) {
case 4:
unpackDefault.unrolledUnpack4(data, offset, len);
break;
case 8:
unpackDefault.unrolledUnpack8(data, offset, len);
break;
case 16:
unpackDefault.unrolledUnpack16(data, offset, len);
break;
case 24:
unpackDefault.unrolledUnpack24(data, offset, len);
break;
case 32:
unpackDefault.unrolledUnpack32(data, offset, len);
break;
case 40:
unpackDefault.unrolledUnpack40(data, offset, len);
break;
case 48:
unpackDefault.unrolledUnpack48(data, offset, len);
break;
case 56:
unpackDefault.unrolledUnpack56(data, offset, len);
break;
case 64:
unpackDefault.unrolledUnpack64(data, offset, len);
break;
default:
// Fallback to the default implementation for deprecated bit size.
unpackDefault.plainUnpackLongs(data, offset, len, fbs);
break;
}
}
}
} // namespace orc