blob: a8e54e005fb2b784be5f06eb0e2a23fea7fb03df [file] [log] [blame]
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with option work for additional information
* regarding copyright ownership. The ASF licenses option file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use option file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "Adaptor.hh"
#include "Compression.hh"
#include "RLEv2.hh"
#include "RLEV2Util.hh"
#define MAX_LITERAL_SIZE 512
#define MAX_SHORT_REPEAT_LENGTH 10
namespace orc {
/**
* Compute the bits required to represent pth percentile value
* @param data - array
* @param p - percentile value (>=0.0 to <=1.0)
* @return pth percentile bits
*/
uint32_t RleEncoderV2::percentileBits(int64_t* data, size_t offset, size_t length, double p, bool reuseHist) {
if ((p > 1.0) || (p <= 0.0)) {
throw InvalidArgument("Invalid p value: " + to_string(p));
}
if (!reuseHist) {
// histogram that store the encoded bit requirement for each values.
// maximum number of bits that can encoded is 32 (refer FixedBitSizes)
memset(histgram, 0, FixedBitSizes::SIZE * sizeof(int32_t));
// compute the histogram
for(size_t i = offset; i < (offset + length); i++) {
uint32_t idx = encodeBitWidth(findClosestNumBits(data[i]));
histgram[idx] += 1;
}
}
int32_t perLen = static_cast<int32_t>(static_cast<double>(length) * (1.0 - p));
// return the bits required by pth percentile length
for(int32_t i = HIST_LEN - 1; i >= 0; i--) {
perLen -= histgram[i];
if (perLen < 0) {
return decodeBitWidth(static_cast<uint32_t>(i));
}
}
return 0;
}
RleEncoderV2::RleEncoderV2(std::unique_ptr<BufferedOutputStream> outStream,
bool hasSigned, bool alignBitPacking) :
RleEncoder(std::move(outStream), hasSigned),
alignedBitPacking(alignBitPacking),
prevDelta(0){
literals = new int64_t[MAX_LITERAL_SIZE];
gapVsPatchList = new int64_t[MAX_LITERAL_SIZE];
zigzagLiterals = new int64_t[MAX_LITERAL_SIZE];
baseRedLiterals = new int64_t[MAX_LITERAL_SIZE];
adjDeltas = new int64_t[MAX_LITERAL_SIZE];
}
void RleEncoderV2::write(int64_t val) {
if(numLiterals == 0) {
initializeLiterals(val);
return;
}
if(numLiterals == 1) {
prevDelta = val - literals[0];
literals[numLiterals++] = val;
if(val == literals[0]) {
fixedRunLength = 2;
variableRunLength = 0;
} else {
fixedRunLength = 0;
variableRunLength = 2;
}
return;
}
int64_t currentDelta = val - literals[numLiterals - 1];
EncodingOption option = {};
if (prevDelta == 0 && currentDelta == 0) {
// case 1: fixed delta run
literals[numLiterals++] = val;
if (variableRunLength > 0) {
// if variable run is non-zero then we are seeing repeating
// values at the end of variable run in which case fixed Run
// length is 2
fixedRunLength = 2;
}
fixedRunLength++;
// if fixed run met the minimum condition and if variable
// run is non-zero then flush the variable run and shift the
// tail fixed runs to start of the buffer
if (fixedRunLength >= MIN_REPEAT && variableRunLength > 0) {
numLiterals -= MIN_REPEAT;
variableRunLength -= (MIN_REPEAT - 1);
determineEncoding(option);
writeValues(option);
// shift tail fixed runs to beginning of the buffer
for (size_t i = 0; i < MIN_REPEAT; ++i) {
literals[i] = val;
}
numLiterals = MIN_REPEAT;
}
if (fixedRunLength == MAX_LITERAL_SIZE) {;
determineEncoding(option);
writeValues(option);
}
return;
}
// case 2: variable delta run
// if fixed run length is non-zero and if it satisfies the
// short repeat conditions then write the values as short repeats
// else use delta encoding
if (fixedRunLength >= MIN_REPEAT) {
if (fixedRunLength <= MAX_SHORT_REPEAT_LENGTH) {
option.encoding = SHORT_REPEAT;
} else {
option.encoding = DELTA;
option.isFixedDelta = true;
}
writeValues(option);
}
// if fixed run length is <MIN_REPEAT and current value is
// different from previous then treat it as variable run
if (fixedRunLength > 0 && fixedRunLength < MIN_REPEAT && val != literals[numLiterals - 1]) {
variableRunLength = fixedRunLength;
fixedRunLength = 0;
}
// after writing values re-initialize the variables
if (numLiterals == 0) {
initializeLiterals(val);
} else {
prevDelta = val - literals[numLiterals - 1];
literals[numLiterals++] = val;
variableRunLength++;
if (variableRunLength == MAX_LITERAL_SIZE) {
determineEncoding(option);
writeValues(option);
}
}
}
void RleEncoderV2::computeZigZagLiterals(EncodingOption &option) {
int64_t zzEncVal = 0;
for (size_t i = 0; i < numLiterals; i++) {
if (isSigned) {
zzEncVal = zigZag(literals[i]);
} else {
zzEncVal = literals[i];
}
zigzagLiterals[option.zigzagLiteralsCount++] = zzEncVal;
}
}
void RleEncoderV2::preparePatchedBlob(EncodingOption& option) {
// mask will be max value beyond which patch will be generated
int64_t mask = static_cast<int64_t>(static_cast<uint64_t>(1) << option.brBits95p) - 1;
// since we are considering only 95 percentile, the size of gap and
// patch array can contain only be 5% values
option.patchLength = static_cast<uint32_t>(std::ceil((numLiterals / 20)));
// #bit for patch
option.patchWidth = option.brBits100p - option.brBits95p;
option.patchWidth = getClosestFixedBits(option.patchWidth);
// if patch bit requirement is 64 then it will not possible to pack
// gap and patch together in a long. To make sure gap and patch can be
// packed together adjust the patch width
if (option.patchWidth == 64) {
option.patchWidth = 56;
option.brBits95p = 8;
mask = static_cast<int64_t>(static_cast<uint64_t>(1) << option.brBits95p) - 1;
}
uint32_t gapIdx = 0;
uint32_t patchIdx = 0;
size_t prev = 0;
size_t maxGap = 0;
std::vector<int64_t> gapList;
std::vector<int64_t> patchList;
for(size_t i = 0; i < numLiterals; i++) {
// if value is above mask then create the patch and record the gap
if (baseRedLiterals[i] > mask) {
size_t gap = i - prev;
if (gap > maxGap) {
maxGap = gap;
}
// gaps are relative, so store the previous patched value index
prev = i;
gapList.push_back(static_cast<int64_t>(gap));
gapIdx++;
// extract the most significant bits that are over mask bits
int64_t patch = baseRedLiterals[i] >> option.brBits95p;
patchList.push_back(patch);
patchIdx++;
// strip off the MSB to enable safe bit packing
baseRedLiterals[i] &= mask;
}
}
// adjust the patch length to number of entries in gap list
option.patchLength = gapIdx;
// if the element to be patched is the first and only element then
// max gap will be 0, but to store the gap as 0 we need atleast 1 bit
if (maxGap == 0 && option.patchLength != 0) {
option.patchGapWidth = 1;
} else {
option.patchGapWidth = findClosestNumBits(static_cast<int64_t>(maxGap));
}
// special case: if the patch gap width is greater than 256, then
// we need 9 bits to encode the gap width. But we only have 3 bits in
// header to record the gap width. To deal with this case, we will save
// two entries in patch list in the following way
// 256 gap width => 0 for patch value
// actual gap - 256 => actual patch value
// We will do the same for gap width = 511. If the element to be patched is
// the last element in the scope then gap width will be 511. In this case we
// will have 3 entries in the patch list in the following way
// 255 gap width => 0 for patch value
// 255 gap width => 0 for patch value
// 1 gap width => actual patch value
if (option.patchGapWidth > 8) {
option.patchGapWidth = 8;
// for gap = 511, we need two additional entries in patch list
if (maxGap == 511) {
option.patchLength += 2;
} else {
option.patchLength += 1;
}
}
// create gap vs patch list
gapIdx = 0;
patchIdx = 0;
for(size_t i = 0; i < option.patchLength; i++) {
int64_t g = gapList[gapIdx++];
int64_t p = patchList[patchIdx++];
while (g > 255) {
gapVsPatchList[option.gapVsPatchListCount++] = (255L << option.patchWidth);
i++;
g -= 255;
}
// store patch value in LSBs and gap in MSBs
gapVsPatchList[option.gapVsPatchListCount++] = ((g << option.patchWidth) | p);
}
}
void RleEncoderV2::determineEncoding(EncodingOption& option) {
// We need to compute zigzag values for DIRECT and PATCHED_BASE encodings,
// but not for SHORT_REPEAT or DELTA. So we only perform the zigzag
// computation when it's determined to be necessary.
// not a big win for shorter runs to determine encoding
if (numLiterals <= MIN_REPEAT) {
// we need to compute zigzag values for DIRECT encoding if we decide to
// break early for delta overflows or for shorter runs
computeZigZagLiterals(option);
option.zzBits100p = percentileBits(zigzagLiterals, 0, numLiterals, 1.0);
option.encoding = DIRECT;
return;
}
// DELTA encoding check
// for identifying monotonic sequences
bool isIncreasing = true;
bool isDecreasing = true;
option.isFixedDelta = true;
option.min = literals[0];
int64_t max = literals[0];
int64_t initialDelta = literals[1] - literals[0];
int64_t currDelta = 0;
int64_t deltaMax = 0;
adjDeltas[option.adjDeltasCount++] = initialDelta;
for (size_t i = 1; i < numLiterals; i++) {
const int64_t l1 = literals[i];
const int64_t l0 = literals[i - 1];
currDelta = l1 - l0;
option.min = std::min(option.min, l1);
max = std::max(max, l1);
isIncreasing &= (l0 <= l1);
isDecreasing &= (l0 >= l1);
option.isFixedDelta &= (currDelta == initialDelta);
if (i > 1) {
adjDeltas[option.adjDeltasCount++] = std::abs(currDelta);
deltaMax = std::max(deltaMax, adjDeltas[i - 1]);
}
}
// it's faster to exit under delta overflow condition without checking for
// PATCHED_BASE condition as encoding using DIRECT is faster and has less
// overhead than PATCHED_BASE
if (!isSafeSubtract(max, option.min)) {
computeZigZagLiterals(option);
option.zzBits100p = percentileBits(zigzagLiterals, 0, numLiterals, 1.0);
option.encoding = DIRECT;
return;
}
// invariant - subtracting any number from any other in the literals after
// option point won't overflow
// if min is equal to max then the delta is 0, option condition happens for
// fixed values run >10 which cannot be encoded with SHORT_REPEAT
if (option.min == max) {
if (!option.isFixedDelta) {
throw InvalidArgument(to_string(option.min) + "==" +
to_string(max) + ", isFixedDelta cannot be false");
}
if(currDelta != 0) {
throw InvalidArgument(to_string(option.min) + "==" +
to_string(max) + ", currDelta should be zero");
}
option.fixedDelta = 0;
option.encoding = DELTA;
return;
}
if (option.isFixedDelta) {
if (currDelta != initialDelta) {
throw InvalidArgument("currDelta should be equal to initialDelta for fixed delta encoding");
}
option.encoding = DELTA;
option.fixedDelta = currDelta;
return;
}
// if initialDelta is 0 then we cannot delta encode as we cannot identify
// the sign of deltas (increasing or decreasing)
if (initialDelta != 0) {
// stores the number of bits required for packing delta blob in
// delta encoding
option.bitsDeltaMax = findClosestNumBits(deltaMax);
// monotonic condition
if (isIncreasing || isDecreasing) {
option.encoding = DELTA;
return;
}
}
// PATCHED_BASE encoding check
// percentile values are computed for the zigzag encoded values. if the
// number of bit requirement between 90th and 100th percentile varies
// beyond a threshold then we need to patch the values. if the variation
// is not significant then we can use direct encoding
computeZigZagLiterals(option);
option.zzBits100p = percentileBits(zigzagLiterals, 0, numLiterals, 1.0);
option.zzBits90p = percentileBits(zigzagLiterals, 0, numLiterals, 0.9, true);
uint32_t diffBitsLH = option.zzBits100p - option.zzBits90p;
// if the difference between 90th percentile and 100th percentile fixed
// bits is > 1 then we need patch the values
if (diffBitsLH > 1) {
// patching is done only on base reduced values.
// remove base from literals
for (size_t i = 0; i < numLiterals; i++) {
baseRedLiterals[option.baseRedLiteralsCount++] = (literals[i] - option.min);
}
// 95th percentile width is used to determine max allowed value
// after which patching will be done
option.brBits95p = percentileBits(baseRedLiterals, 0, numLiterals, 0.95);
// 100th percentile is used to compute the max patch width
option.brBits100p = percentileBits(baseRedLiterals, 0, numLiterals, 1.0, true);
// after base reducing the values, if the difference in bits between
// 95th percentile and 100th percentile value is zero then there
// is no point in patching the values, in which case we will
// fallback to DIRECT encoding.
// The decision to use patched base was based on zigzag values, but the
// actual patching is done on base reduced literals.
if ((option.brBits100p - option.brBits95p) != 0) {
option.encoding = PATCHED_BASE;
preparePatchedBlob(option);
return;
} else {
option.encoding = DIRECT;
return;
}
} else {
// if difference in bits between 95th percentile and 100th percentile is
// 0, then patch length will become 0. Hence we will fallback to direct
option.encoding = DIRECT;
return;
}
}
uint64_t RleEncoderV2::flush() {
if (numLiterals != 0) {
EncodingOption option = {};
if (variableRunLength != 0) {
determineEncoding(option);
writeValues(option);
} else if (fixedRunLength != 0) {
if (fixedRunLength < MIN_REPEAT) {
variableRunLength = fixedRunLength;
fixedRunLength = 0;
determineEncoding(option);
writeValues(option);
} else if (fixedRunLength >= MIN_REPEAT
&& fixedRunLength <= MAX_SHORT_REPEAT_LENGTH) {
option.encoding = SHORT_REPEAT;
writeValues(option);
} else {
option.encoding = DELTA;
option.isFixedDelta = true;
writeValues(option);
}
}
}
outputStream->BackUp(static_cast<int>(bufferLength - bufferPosition));
uint64_t dataSize = outputStream->flush();
bufferLength = bufferPosition = 0;
return dataSize;
}
void RleEncoderV2::writeValues(EncodingOption& option) {
if (numLiterals != 0) {
switch (option.encoding) {
case SHORT_REPEAT:
writeShortRepeatValues(option);
break;
case DIRECT:
writeDirectValues(option);
break;
case PATCHED_BASE:
writePatchedBasedValues(option);
break;
case DELTA:
writeDeltaValues(option);
break;
default:
throw NotImplementedYet("Not implemented yet");
}
numLiterals = 0;
prevDelta = 0;
}
}
void RleEncoderV2::writeShortRepeatValues(EncodingOption&) {
int64_t repeatVal;
if (isSigned) {
repeatVal = zigZag(literals[0]);
} else {
repeatVal = literals[0];
}
const uint32_t numBitsRepeatVal = findClosestNumBits(repeatVal);
const uint32_t numBytesRepeatVal = numBitsRepeatVal % 8 == 0 ? (numBitsRepeatVal >> 3) : ((numBitsRepeatVal >> 3) + 1);
uint32_t header = getOpCode(SHORT_REPEAT);
fixedRunLength -= MIN_REPEAT;
header |= fixedRunLength;
header |= ((numBytesRepeatVal - 1) << 3);
writeByte(static_cast<char>(header));
for(int32_t i = static_cast<int32_t>(numBytesRepeatVal - 1); i >= 0; i--) {
int64_t b = ((repeatVal >> (i * 8)) & 0xff);
writeByte(static_cast<char>(b));
}
fixedRunLength = 0;
}
void RleEncoderV2::writeDirectValues(EncodingOption& option) {
// write the number of fixed bits required in next 5 bits
uint32_t fb = option.zzBits100p;
if (alignedBitPacking) {
fb = getClosestAlignedFixedBits(fb);
}
const uint32_t efb = encodeBitWidth(fb) << 1;
// adjust variable run length
variableRunLength -= 1;
// extract the 9th bit of run length
const uint32_t tailBits = (variableRunLength & 0x100) >> 8;
// create first byte of the header
const char headerFirstByte = static_cast<char>(getOpCode(DIRECT) | efb | tailBits);
// second byte of the header stores the remaining 8 bits of runlength
const char headerSecondByte = static_cast<char>(variableRunLength & 0xff);
// write header
writeByte(headerFirstByte);
writeByte(headerSecondByte);
// bit packing the zigzag encoded literals
writeInts(zigzagLiterals, 0, numLiterals, fb);
// reset run length
variableRunLength = 0;
}
void RleEncoderV2::writePatchedBasedValues(EncodingOption& option) {
// NOTE: Aligned bit packing cannot be applied for PATCHED_BASE encoding
// because patch is applied to MSB bits. For example: If fixed bit width of
// base value is 7 bits and if patch is 3 bits, the actual value is
// constructed by shifting the patch to left by 7 positions.
// actual_value = patch << 7 | base_value
// So, if we align base_value then actual_value can not be reconstructed.
// write the number of fixed bits required in next 5 bits
const uint32_t efb = encodeBitWidth(option.brBits95p) << 1;
// adjust variable run length, they are one off
variableRunLength -= 1;
// extract the 9th bit of run length
const uint32_t tailBits = (variableRunLength & 0x100) >> 8;
// create first byte of the header
const char headerFirstByte = static_cast<char>(getOpCode(PATCHED_BASE) | efb | tailBits);
// second byte of the header stores the remaining 8 bits of runlength
const char headerSecondByte = static_cast<char>(variableRunLength & 0xff);
// if the min value is negative toggle the sign
const bool isNegative = (option.min < 0);
if (isNegative) {
option.min = -option.min;
}
// find the number of bytes required for base and shift it by 5 bits
// to accommodate patch width. The additional bit is used to store the sign
// of the base value.
const uint32_t baseWidth = findClosestNumBits(option.min) + 1;
const uint32_t baseBytes = baseWidth % 8 == 0 ? baseWidth / 8 : (baseWidth / 8) + 1;
const uint32_t bb = (baseBytes - 1) << 5;
// if the base value is negative then set MSB to 1
if (isNegative) {
option.min |= (1LL << ((baseBytes * 8) - 1));
}
// third byte contains 3 bits for number of bytes occupied by base
// and 5 bits for patchWidth
const char headerThirdByte = static_cast<char>(bb | encodeBitWidth(option.patchWidth));
// fourth byte contains 3 bits for page gap width and 5 bits for
// patch length
const char headerFourthByte = static_cast<char>((option.patchGapWidth - 1) << 5 | option.patchLength);
// write header
writeByte(headerFirstByte);
writeByte(headerSecondByte);
writeByte(headerThirdByte);
writeByte(headerFourthByte);
// write the base value using fixed bytes in big endian order
for(int32_t i = static_cast<int32_t>(baseBytes - 1); i >= 0; i--) {
char b = static_cast<char>(((option.min >> (i * 8)) & 0xff));
writeByte(b);
}
// base reduced literals are bit packed
uint32_t closestFixedBits = getClosestFixedBits(option.brBits95p);
writeInts(baseRedLiterals, 0, numLiterals, closestFixedBits);
// write patch list
closestFixedBits = getClosestFixedBits(option.patchGapWidth + option.patchWidth);
writeInts(gapVsPatchList, 0, option.patchLength, closestFixedBits);
// reset run length
variableRunLength = 0;
}
void RleEncoderV2::writeDeltaValues(EncodingOption& option) {
uint32_t len = 0;
uint32_t fb = option.bitsDeltaMax;
uint32_t efb = 0;
if (alignedBitPacking) {
fb = getClosestAlignedFixedBits(fb);
}
if (option.isFixedDelta) {
// if fixed run length is greater than threshold then it will be fixed
// delta sequence with delta value 0 else fixed delta sequence with
// non-zero delta value
if (fixedRunLength > MIN_REPEAT) {
// ex. sequence: 2 2 2 2 2 2 2 2
len = fixedRunLength - 1;
fixedRunLength = 0;
} else {
// ex. sequence: 4 6 8 10 12 14 16
len = variableRunLength - 1;
variableRunLength = 0;
}
} else {
// fixed width 0 is used for long repeating values.
// sequences that require only 1 bit to encode will have an additional bit
if (fb == 1) {
fb = 2;
}
efb = encodeBitWidth(fb) << 1;
len = variableRunLength - 1;
variableRunLength = 0;
}
// extract the 9th bit of run length
const uint32_t tailBits = (len & 0x100) >> 8;
// create first byte of the header
const char headerFirstByte = static_cast<char>(getOpCode(DELTA) | efb | tailBits);
// second byte of the header stores the remaining 8 bits of runlength
const char headerSecondByte = static_cast<char>(len & 0xff);
// write header
writeByte(headerFirstByte);
writeByte(headerSecondByte);
// store the first value from zigzag literal array
if (isSigned) {
writeVslong(literals[0]);
} else {
writeVulong(literals[0]);
}
if (option.isFixedDelta) {
// if delta is fixed then we don't need to store delta blob
writeVslong(option.fixedDelta);
} else {
// store the first value as delta value using zigzag encoding
writeVslong(adjDeltas[0]);
// adjacent delta values are bit packed. The length of adjDeltas array is
// always one less than the number of literals (delta difference for n
// elements is n-1). We have already written one element, write the
// remaining numLiterals - 2 elements here
writeInts(adjDeltas, 1, numLiterals - 2, fb);
}
}
void RleEncoderV2::writeInts(int64_t* input, uint32_t offset, size_t len, uint32_t bitSize) {
if(input == nullptr || len < 1 || bitSize < 1) {
return;
}
if (getClosestAlignedFixedBits(bitSize) == bitSize) {
uint32_t numBytes;
uint32_t endOffSet = static_cast<uint32_t>(offset + len);
if (bitSize < 8 ) {;
char bitMask = static_cast<char>((1 << bitSize) - 1);
uint32_t numHops = 8 / bitSize;
uint32_t remainder = static_cast<uint32_t>(len % numHops);
uint32_t endUnroll = endOffSet - remainder;
for (uint32_t i = offset; i < endUnroll; i+=numHops) {
char toWrite = 0;
for (uint32_t j = 0; j < numHops; ++j) {
toWrite |= static_cast<char>((input[i+j] & bitMask) << (8 - (j + 1) * bitSize));
}
writeByte(toWrite);
}
if (remainder > 0) {
uint32_t startShift = 8 - bitSize;
char toWrite = 0;
for (uint32_t i = endUnroll; i < endOffSet; ++i) {
toWrite |= static_cast<char>((input[i] & bitMask) << startShift);
startShift -= bitSize;
}
writeByte(toWrite);
}
} else {
numBytes = bitSize / 8;
for (uint32_t i = offset; i < endOffSet; ++i) {
for (uint32_t j = 0; j < numBytes; ++j) {
char toWrite = static_cast<char>((input[i] >> (8 * (numBytes - j - 1))) & 255);
writeByte(toWrite);
}
}
}
return;
}
// write for unaligned bit size
uint32_t bitsLeft = 8;
char current = 0;
for(uint32_t i = offset; i < (offset + len); i++) {
int64_t value = input[i];
uint32_t bitsToWrite = bitSize;
while (bitsToWrite > bitsLeft) {
// add the bits to the bottom of the current word
current |= static_cast<char>(value >> (bitsToWrite - bitsLeft));
// subtract out the bits we just added
bitsToWrite -= bitsLeft;
// zero out the bits above bitsToWrite
value &= (static_cast<uint64_t>(1) << bitsToWrite) - 1;
writeByte(current);
current = 0;
bitsLeft = 8;
}
bitsLeft -= bitsToWrite;
current |= static_cast<char>(value << bitsLeft);
if (bitsLeft == 0) {
writeByte(current);
current = 0;
bitsLeft = 8;
}
}
// flush
if (bitsLeft != 8) {
writeByte(current);
}
}
void RleEncoderV2::initializeLiterals(int64_t val) {
literals[numLiterals++] = val;
fixedRunLength = 1;
variableRunLength = 1;
}
}