| /** |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with option work for additional information |
| * regarding copyright ownership. The ASF licenses option file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use option file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| #include "Adaptor.hh" |
| #include "Compression.hh" |
| #include "RLEv2.hh" |
| #include "RLEV2Util.hh" |
| |
| #define MAX_LITERAL_SIZE 512 |
| #define MAX_SHORT_REPEAT_LENGTH 10 |
| |
| namespace orc { |
| |
| /** |
| * Compute the bits required to represent pth percentile value |
| * @param data - array |
| * @param p - percentile value (>=0.0 to <=1.0) |
| * @return pth percentile bits |
| */ |
| uint32_t RleEncoderV2::percentileBits(int64_t* data, size_t offset, size_t length, double p, bool reuseHist) { |
| if ((p > 1.0) || (p <= 0.0)) { |
| throw InvalidArgument("Invalid p value: " + to_string(p)); |
| } |
| |
| if (!reuseHist) { |
| // histogram that store the encoded bit requirement for each values. |
| // maximum number of bits that can encoded is 32 (refer FixedBitSizes) |
| memset(histgram, 0, FixedBitSizes::SIZE * sizeof(int32_t)); |
| // compute the histogram |
| for(size_t i = offset; i < (offset + length); i++) { |
| uint32_t idx = encodeBitWidth(findClosestNumBits(data[i])); |
| histgram[idx] += 1; |
| } |
| } |
| |
| int32_t perLen = static_cast<int32_t>(static_cast<double>(length) * (1.0 - p)); |
| |
| // return the bits required by pth percentile length |
| for(int32_t i = HIST_LEN - 1; i >= 0; i--) { |
| perLen -= histgram[i]; |
| if (perLen < 0) { |
| return decodeBitWidth(static_cast<uint32_t>(i)); |
| } |
| } |
| return 0; |
| } |
| |
| RleEncoderV2::RleEncoderV2(std::unique_ptr<BufferedOutputStream> outStream, |
| bool hasSigned, bool alignBitPacking) : |
| RleEncoder(std::move(outStream), hasSigned), |
| alignedBitPacking(alignBitPacking), |
| prevDelta(0){ |
| literals = new int64_t[MAX_LITERAL_SIZE]; |
| gapVsPatchList = new int64_t[MAX_LITERAL_SIZE]; |
| zigzagLiterals = new int64_t[MAX_LITERAL_SIZE]; |
| baseRedLiterals = new int64_t[MAX_LITERAL_SIZE]; |
| adjDeltas = new int64_t[MAX_LITERAL_SIZE]; |
| } |
| |
| void RleEncoderV2::write(int64_t val) { |
| if(numLiterals == 0) { |
| initializeLiterals(val); |
| return; |
| } |
| |
| if(numLiterals == 1) { |
| prevDelta = val - literals[0]; |
| literals[numLiterals++] = val; |
| |
| if(val == literals[0]) { |
| fixedRunLength = 2; |
| variableRunLength = 0; |
| } else { |
| fixedRunLength = 0; |
| variableRunLength = 2; |
| } |
| return; |
| } |
| |
| int64_t currentDelta = val - literals[numLiterals - 1]; |
| EncodingOption option = {}; |
| if (prevDelta == 0 && currentDelta == 0) { |
| // case 1: fixed delta run |
| literals[numLiterals++] = val; |
| |
| if (variableRunLength > 0) { |
| // if variable run is non-zero then we are seeing repeating |
| // values at the end of variable run in which case fixed Run |
| // length is 2 |
| fixedRunLength = 2; |
| } |
| fixedRunLength++; |
| |
| // if fixed run met the minimum condition and if variable |
| // run is non-zero then flush the variable run and shift the |
| // tail fixed runs to start of the buffer |
| if (fixedRunLength >= MIN_REPEAT && variableRunLength > 0) { |
| numLiterals -= MIN_REPEAT; |
| variableRunLength -= (MIN_REPEAT - 1); |
| |
| determineEncoding(option); |
| writeValues(option); |
| |
| // shift tail fixed runs to beginning of the buffer |
| for (size_t i = 0; i < MIN_REPEAT; ++i) { |
| literals[i] = val; |
| } |
| numLiterals = MIN_REPEAT; |
| } |
| |
| if (fixedRunLength == MAX_LITERAL_SIZE) {; |
| determineEncoding(option); |
| writeValues(option); |
| } |
| return; |
| } |
| |
| // case 2: variable delta run |
| |
| // if fixed run length is non-zero and if it satisfies the |
| // short repeat conditions then write the values as short repeats |
| // else use delta encoding |
| if (fixedRunLength >= MIN_REPEAT) { |
| if (fixedRunLength <= MAX_SHORT_REPEAT_LENGTH) { |
| option.encoding = SHORT_REPEAT; |
| } else { |
| option.encoding = DELTA; |
| option.isFixedDelta = true; |
| } |
| writeValues(option); |
| } |
| |
| // if fixed run length is <MIN_REPEAT and current value is |
| // different from previous then treat it as variable run |
| if (fixedRunLength > 0 && fixedRunLength < MIN_REPEAT && val != literals[numLiterals - 1]) { |
| variableRunLength = fixedRunLength; |
| fixedRunLength = 0; |
| } |
| |
| // after writing values re-initialize the variables |
| if (numLiterals == 0) { |
| initializeLiterals(val); |
| } else { |
| prevDelta = val - literals[numLiterals - 1]; |
| literals[numLiterals++] = val; |
| variableRunLength++; |
| |
| if (variableRunLength == MAX_LITERAL_SIZE) { |
| determineEncoding(option); |
| writeValues(option); |
| } |
| } |
| } |
| |
| void RleEncoderV2::computeZigZagLiterals(EncodingOption &option) { |
| int64_t zzEncVal = 0; |
| for (size_t i = 0; i < numLiterals; i++) { |
| if (isSigned) { |
| zzEncVal = zigZag(literals[i]); |
| } else { |
| zzEncVal = literals[i]; |
| } |
| zigzagLiterals[option.zigzagLiteralsCount++] = zzEncVal; |
| } |
| } |
| |
| void RleEncoderV2::preparePatchedBlob(EncodingOption& option) { |
| // mask will be max value beyond which patch will be generated |
| int64_t mask = static_cast<int64_t>(static_cast<uint64_t>(1) << option.brBits95p) - 1; |
| |
| // since we are considering only 95 percentile, the size of gap and |
| // patch array can contain only be 5% values |
| option.patchLength = static_cast<uint32_t>(std::ceil((numLiterals / 20))); |
| |
| // #bit for patch |
| option.patchWidth = option.brBits100p - option.brBits95p; |
| option.patchWidth = getClosestFixedBits(option.patchWidth); |
| |
| // if patch bit requirement is 64 then it will not possible to pack |
| // gap and patch together in a long. To make sure gap and patch can be |
| // packed together adjust the patch width |
| if (option.patchWidth == 64) { |
| option.patchWidth = 56; |
| option.brBits95p = 8; |
| mask = static_cast<int64_t>(static_cast<uint64_t>(1) << option.brBits95p) - 1; |
| } |
| |
| uint32_t gapIdx = 0; |
| uint32_t patchIdx = 0; |
| size_t prev = 0; |
| size_t maxGap = 0; |
| |
| std::vector<int64_t> gapList; |
| std::vector<int64_t> patchList; |
| |
| for(size_t i = 0; i < numLiterals; i++) { |
| // if value is above mask then create the patch and record the gap |
| if (baseRedLiterals[i] > mask) { |
| size_t gap = i - prev; |
| if (gap > maxGap) { |
| maxGap = gap; |
| } |
| |
| // gaps are relative, so store the previous patched value index |
| prev = i; |
| gapList.push_back(static_cast<int64_t>(gap)); |
| gapIdx++; |
| |
| // extract the most significant bits that are over mask bits |
| int64_t patch = baseRedLiterals[i] >> option.brBits95p; |
| patchList.push_back(patch); |
| patchIdx++; |
| |
| // strip off the MSB to enable safe bit packing |
| baseRedLiterals[i] &= mask; |
| } |
| } |
| |
| // adjust the patch length to number of entries in gap list |
| option.patchLength = gapIdx; |
| |
| // if the element to be patched is the first and only element then |
| // max gap will be 0, but to store the gap as 0 we need atleast 1 bit |
| if (maxGap == 0 && option.patchLength != 0) { |
| option.patchGapWidth = 1; |
| } else { |
| option.patchGapWidth = findClosestNumBits(static_cast<int64_t>(maxGap)); |
| } |
| |
| // special case: if the patch gap width is greater than 256, then |
| // we need 9 bits to encode the gap width. But we only have 3 bits in |
| // header to record the gap width. To deal with this case, we will save |
| // two entries in patch list in the following way |
| // 256 gap width => 0 for patch value |
| // actual gap - 256 => actual patch value |
| // We will do the same for gap width = 511. If the element to be patched is |
| // the last element in the scope then gap width will be 511. In this case we |
| // will have 3 entries in the patch list in the following way |
| // 255 gap width => 0 for patch value |
| // 255 gap width => 0 for patch value |
| // 1 gap width => actual patch value |
| if (option.patchGapWidth > 8) { |
| option.patchGapWidth = 8; |
| // for gap = 511, we need two additional entries in patch list |
| if (maxGap == 511) { |
| option.patchLength += 2; |
| } else { |
| option.patchLength += 1; |
| } |
| } |
| |
| // create gap vs patch list |
| gapIdx = 0; |
| patchIdx = 0; |
| for(size_t i = 0; i < option.patchLength; i++) { |
| int64_t g = gapList[gapIdx++]; |
| int64_t p = patchList[patchIdx++]; |
| while (g > 255) { |
| gapVsPatchList[option.gapVsPatchListCount++] = (255L << option.patchWidth); |
| i++; |
| g -= 255; |
| } |
| |
| // store patch value in LSBs and gap in MSBs |
| gapVsPatchList[option.gapVsPatchListCount++] = ((g << option.patchWidth) | p); |
| } |
| } |
| |
| void RleEncoderV2::determineEncoding(EncodingOption& option) { |
| // We need to compute zigzag values for DIRECT and PATCHED_BASE encodings, |
| // but not for SHORT_REPEAT or DELTA. So we only perform the zigzag |
| // computation when it's determined to be necessary. |
| |
| // not a big win for shorter runs to determine encoding |
| if (numLiterals <= MIN_REPEAT) { |
| // we need to compute zigzag values for DIRECT encoding if we decide to |
| // break early for delta overflows or for shorter runs |
| computeZigZagLiterals(option); |
| option.zzBits100p = percentileBits(zigzagLiterals, 0, numLiterals, 1.0); |
| option.encoding = DIRECT; |
| return; |
| } |
| |
| // DELTA encoding check |
| |
| // for identifying monotonic sequences |
| bool isIncreasing = true; |
| bool isDecreasing = true; |
| option.isFixedDelta = true; |
| |
| option.min = literals[0]; |
| int64_t max = literals[0]; |
| int64_t initialDelta = literals[1] - literals[0]; |
| int64_t currDelta = 0; |
| int64_t deltaMax = 0; |
| adjDeltas[option.adjDeltasCount++] = initialDelta; |
| |
| for (size_t i = 1; i < numLiterals; i++) { |
| const int64_t l1 = literals[i]; |
| const int64_t l0 = literals[i - 1]; |
| currDelta = l1 - l0; |
| option.min = std::min(option.min, l1); |
| max = std::max(max, l1); |
| |
| isIncreasing &= (l0 <= l1); |
| isDecreasing &= (l0 >= l1); |
| |
| option.isFixedDelta &= (currDelta == initialDelta); |
| if (i > 1) { |
| adjDeltas[option.adjDeltasCount++] = std::abs(currDelta); |
| deltaMax = std::max(deltaMax, adjDeltas[i - 1]); |
| } |
| } |
| |
| // it's faster to exit under delta overflow condition without checking for |
| // PATCHED_BASE condition as encoding using DIRECT is faster and has less |
| // overhead than PATCHED_BASE |
| if (!isSafeSubtract(max, option.min)) { |
| computeZigZagLiterals(option); |
| option.zzBits100p = percentileBits(zigzagLiterals, 0, numLiterals, 1.0); |
| option.encoding = DIRECT; |
| return; |
| } |
| |
| // invariant - subtracting any number from any other in the literals after |
| // option point won't overflow |
| |
| // if min is equal to max then the delta is 0, option condition happens for |
| // fixed values run >10 which cannot be encoded with SHORT_REPEAT |
| if (option.min == max) { |
| if (!option.isFixedDelta) { |
| throw InvalidArgument(to_string(option.min) + "==" + |
| to_string(max) + ", isFixedDelta cannot be false"); |
| } |
| |
| if(currDelta != 0) { |
| throw InvalidArgument(to_string(option.min) + "==" + |
| to_string(max) + ", currDelta should be zero"); |
| } |
| option.fixedDelta = 0; |
| option.encoding = DELTA; |
| return; |
| } |
| |
| if (option.isFixedDelta) { |
| if (currDelta != initialDelta) { |
| throw InvalidArgument("currDelta should be equal to initialDelta for fixed delta encoding"); |
| } |
| |
| option.encoding = DELTA; |
| option.fixedDelta = currDelta; |
| return; |
| } |
| |
| // if initialDelta is 0 then we cannot delta encode as we cannot identify |
| // the sign of deltas (increasing or decreasing) |
| if (initialDelta != 0) { |
| // stores the number of bits required for packing delta blob in |
| // delta encoding |
| option.bitsDeltaMax = findClosestNumBits(deltaMax); |
| |
| // monotonic condition |
| if (isIncreasing || isDecreasing) { |
| option.encoding = DELTA; |
| return; |
| } |
| } |
| |
| // PATCHED_BASE encoding check |
| |
| // percentile values are computed for the zigzag encoded values. if the |
| // number of bit requirement between 90th and 100th percentile varies |
| // beyond a threshold then we need to patch the values. if the variation |
| // is not significant then we can use direct encoding |
| |
| computeZigZagLiterals(option); |
| option.zzBits100p = percentileBits(zigzagLiterals, 0, numLiterals, 1.0); |
| option.zzBits90p = percentileBits(zigzagLiterals, 0, numLiterals, 0.9, true); |
| uint32_t diffBitsLH = option.zzBits100p - option.zzBits90p; |
| |
| // if the difference between 90th percentile and 100th percentile fixed |
| // bits is > 1 then we need patch the values |
| if (diffBitsLH > 1) { |
| |
| // patching is done only on base reduced values. |
| // remove base from literals |
| for (size_t i = 0; i < numLiterals; i++) { |
| baseRedLiterals[option.baseRedLiteralsCount++] = (literals[i] - option.min); |
| } |
| |
| // 95th percentile width is used to determine max allowed value |
| // after which patching will be done |
| option.brBits95p = percentileBits(baseRedLiterals, 0, numLiterals, 0.95); |
| |
| // 100th percentile is used to compute the max patch width |
| option.brBits100p = percentileBits(baseRedLiterals, 0, numLiterals, 1.0, true); |
| |
| // after base reducing the values, if the difference in bits between |
| // 95th percentile and 100th percentile value is zero then there |
| // is no point in patching the values, in which case we will |
| // fallback to DIRECT encoding. |
| // The decision to use patched base was based on zigzag values, but the |
| // actual patching is done on base reduced literals. |
| if ((option.brBits100p - option.brBits95p) != 0) { |
| option.encoding = PATCHED_BASE; |
| preparePatchedBlob(option); |
| return; |
| } else { |
| option.encoding = DIRECT; |
| return; |
| } |
| } else { |
| // if difference in bits between 95th percentile and 100th percentile is |
| // 0, then patch length will become 0. Hence we will fallback to direct |
| option.encoding = DIRECT; |
| return; |
| } |
| } |
| |
| uint64_t RleEncoderV2::flush() { |
| if (numLiterals != 0) { |
| EncodingOption option = {}; |
| if (variableRunLength != 0) { |
| determineEncoding(option); |
| writeValues(option); |
| } else if (fixedRunLength != 0) { |
| if (fixedRunLength < MIN_REPEAT) { |
| variableRunLength = fixedRunLength; |
| fixedRunLength = 0; |
| determineEncoding(option); |
| writeValues(option); |
| } else if (fixedRunLength >= MIN_REPEAT |
| && fixedRunLength <= MAX_SHORT_REPEAT_LENGTH) { |
| option.encoding = SHORT_REPEAT; |
| writeValues(option); |
| } else { |
| option.encoding = DELTA; |
| option.isFixedDelta = true; |
| writeValues(option); |
| } |
| } |
| } |
| |
| outputStream->BackUp(static_cast<int>(bufferLength - bufferPosition)); |
| uint64_t dataSize = outputStream->flush(); |
| bufferLength = bufferPosition = 0; |
| return dataSize; |
| } |
| |
| void RleEncoderV2::writeValues(EncodingOption& option) { |
| if (numLiterals != 0) { |
| switch (option.encoding) { |
| case SHORT_REPEAT: |
| writeShortRepeatValues(option); |
| break; |
| case DIRECT: |
| writeDirectValues(option); |
| break; |
| case PATCHED_BASE: |
| writePatchedBasedValues(option); |
| break; |
| case DELTA: |
| writeDeltaValues(option); |
| break; |
| default: |
| throw NotImplementedYet("Not implemented yet"); |
| } |
| |
| numLiterals = 0; |
| prevDelta = 0; |
| } |
| } |
| |
| void RleEncoderV2::writeShortRepeatValues(EncodingOption&) { |
| int64_t repeatVal; |
| if (isSigned) { |
| repeatVal = zigZag(literals[0]); |
| } else { |
| repeatVal = literals[0]; |
| } |
| |
| const uint32_t numBitsRepeatVal = findClosestNumBits(repeatVal); |
| const uint32_t numBytesRepeatVal = numBitsRepeatVal % 8 == 0 ? (numBitsRepeatVal >> 3) : ((numBitsRepeatVal >> 3) + 1); |
| |
| uint32_t header = getOpCode(SHORT_REPEAT); |
| |
| fixedRunLength -= MIN_REPEAT; |
| header |= fixedRunLength; |
| header |= ((numBytesRepeatVal - 1) << 3); |
| |
| writeByte(static_cast<char>(header)); |
| |
| for(int32_t i = static_cast<int32_t>(numBytesRepeatVal - 1); i >= 0; i--) { |
| int64_t b = ((repeatVal >> (i * 8)) & 0xff); |
| writeByte(static_cast<char>(b)); |
| } |
| |
| fixedRunLength = 0; |
| } |
| |
| void RleEncoderV2::writeDirectValues(EncodingOption& option) { |
| // write the number of fixed bits required in next 5 bits |
| uint32_t fb = option.zzBits100p; |
| if (alignedBitPacking) { |
| fb = getClosestAlignedFixedBits(fb); |
| } |
| |
| const uint32_t efb = encodeBitWidth(fb) << 1; |
| |
| // adjust variable run length |
| variableRunLength -= 1; |
| |
| // extract the 9th bit of run length |
| const uint32_t tailBits = (variableRunLength & 0x100) >> 8; |
| |
| // create first byte of the header |
| const char headerFirstByte = static_cast<char>(getOpCode(DIRECT) | efb | tailBits); |
| |
| // second byte of the header stores the remaining 8 bits of runlength |
| const char headerSecondByte = static_cast<char>(variableRunLength & 0xff); |
| |
| // write header |
| writeByte(headerFirstByte); |
| writeByte(headerSecondByte); |
| |
| // bit packing the zigzag encoded literals |
| writeInts(zigzagLiterals, 0, numLiterals, fb); |
| |
| // reset run length |
| variableRunLength = 0; |
| } |
| |
| void RleEncoderV2::writePatchedBasedValues(EncodingOption& option) { |
| // NOTE: Aligned bit packing cannot be applied for PATCHED_BASE encoding |
| // because patch is applied to MSB bits. For example: If fixed bit width of |
| // base value is 7 bits and if patch is 3 bits, the actual value is |
| // constructed by shifting the patch to left by 7 positions. |
| // actual_value = patch << 7 | base_value |
| // So, if we align base_value then actual_value can not be reconstructed. |
| |
| // write the number of fixed bits required in next 5 bits |
| const uint32_t efb = encodeBitWidth(option.brBits95p) << 1; |
| |
| // adjust variable run length, they are one off |
| variableRunLength -= 1; |
| |
| // extract the 9th bit of run length |
| const uint32_t tailBits = (variableRunLength & 0x100) >> 8; |
| |
| // create first byte of the header |
| const char headerFirstByte = static_cast<char>(getOpCode(PATCHED_BASE) | efb | tailBits); |
| |
| // second byte of the header stores the remaining 8 bits of runlength |
| const char headerSecondByte = static_cast<char>(variableRunLength & 0xff); |
| |
| // if the min value is negative toggle the sign |
| const bool isNegative = (option.min < 0); |
| if (isNegative) { |
| option.min = -option.min; |
| } |
| |
| // find the number of bytes required for base and shift it by 5 bits |
| // to accommodate patch width. The additional bit is used to store the sign |
| // of the base value. |
| const uint32_t baseWidth = findClosestNumBits(option.min) + 1; |
| const uint32_t baseBytes = baseWidth % 8 == 0 ? baseWidth / 8 : (baseWidth / 8) + 1; |
| const uint32_t bb = (baseBytes - 1) << 5; |
| |
| // if the base value is negative then set MSB to 1 |
| if (isNegative) { |
| option.min |= (1LL << ((baseBytes * 8) - 1)); |
| } |
| |
| // third byte contains 3 bits for number of bytes occupied by base |
| // and 5 bits for patchWidth |
| const char headerThirdByte = static_cast<char>(bb | encodeBitWidth(option.patchWidth)); |
| |
| // fourth byte contains 3 bits for page gap width and 5 bits for |
| // patch length |
| const char headerFourthByte = static_cast<char>((option.patchGapWidth - 1) << 5 | option.patchLength); |
| |
| // write header |
| writeByte(headerFirstByte); |
| writeByte(headerSecondByte); |
| writeByte(headerThirdByte); |
| writeByte(headerFourthByte); |
| |
| // write the base value using fixed bytes in big endian order |
| for(int32_t i = static_cast<int32_t>(baseBytes - 1); i >= 0; i--) { |
| char b = static_cast<char>(((option.min >> (i * 8)) & 0xff)); |
| writeByte(b); |
| } |
| |
| // base reduced literals are bit packed |
| uint32_t closestFixedBits = getClosestFixedBits(option.brBits95p); |
| |
| writeInts(baseRedLiterals, 0, numLiterals, closestFixedBits); |
| |
| // write patch list |
| closestFixedBits = getClosestFixedBits(option.patchGapWidth + option.patchWidth); |
| |
| writeInts(gapVsPatchList, 0, option.patchLength, closestFixedBits); |
| |
| // reset run length |
| variableRunLength = 0; |
| } |
| |
| void RleEncoderV2::writeDeltaValues(EncodingOption& option) { |
| uint32_t len = 0; |
| uint32_t fb = option.bitsDeltaMax; |
| uint32_t efb = 0; |
| |
| if (alignedBitPacking) { |
| fb = getClosestAlignedFixedBits(fb); |
| } |
| |
| if (option.isFixedDelta) { |
| // if fixed run length is greater than threshold then it will be fixed |
| // delta sequence with delta value 0 else fixed delta sequence with |
| // non-zero delta value |
| if (fixedRunLength > MIN_REPEAT) { |
| // ex. sequence: 2 2 2 2 2 2 2 2 |
| len = fixedRunLength - 1; |
| fixedRunLength = 0; |
| } else { |
| // ex. sequence: 4 6 8 10 12 14 16 |
| len = variableRunLength - 1; |
| variableRunLength = 0; |
| } |
| } else { |
| // fixed width 0 is used for long repeating values. |
| // sequences that require only 1 bit to encode will have an additional bit |
| if (fb == 1) { |
| fb = 2; |
| } |
| efb = encodeBitWidth(fb) << 1; |
| len = variableRunLength - 1; |
| variableRunLength = 0; |
| } |
| |
| // extract the 9th bit of run length |
| const uint32_t tailBits = (len & 0x100) >> 8; |
| |
| // create first byte of the header |
| const char headerFirstByte = static_cast<char>(getOpCode(DELTA) | efb | tailBits); |
| |
| // second byte of the header stores the remaining 8 bits of runlength |
| const char headerSecondByte = static_cast<char>(len & 0xff); |
| |
| // write header |
| writeByte(headerFirstByte); |
| writeByte(headerSecondByte); |
| |
| // store the first value from zigzag literal array |
| if (isSigned) { |
| writeVslong(literals[0]); |
| } else { |
| writeVulong(literals[0]); |
| } |
| |
| if (option.isFixedDelta) { |
| // if delta is fixed then we don't need to store delta blob |
| writeVslong(option.fixedDelta); |
| } else { |
| // store the first value as delta value using zigzag encoding |
| writeVslong(adjDeltas[0]); |
| |
| // adjacent delta values are bit packed. The length of adjDeltas array is |
| // always one less than the number of literals (delta difference for n |
| // elements is n-1). We have already written one element, write the |
| // remaining numLiterals - 2 elements here |
| writeInts(adjDeltas, 1, numLiterals - 2, fb); |
| } |
| } |
| |
| void RleEncoderV2::writeInts(int64_t* input, uint32_t offset, size_t len, uint32_t bitSize) { |
| if(input == nullptr || len < 1 || bitSize < 1) { |
| return; |
| } |
| |
| if (getClosestAlignedFixedBits(bitSize) == bitSize) { |
| uint32_t numBytes; |
| uint32_t endOffSet = static_cast<uint32_t>(offset + len); |
| if (bitSize < 8 ) {; |
| char bitMask = static_cast<char>((1 << bitSize) - 1); |
| uint32_t numHops = 8 / bitSize; |
| uint32_t remainder = static_cast<uint32_t>(len % numHops); |
| uint32_t endUnroll = endOffSet - remainder; |
| for (uint32_t i = offset; i < endUnroll; i+=numHops) { |
| char toWrite = 0; |
| for (uint32_t j = 0; j < numHops; ++j) { |
| toWrite |= static_cast<char>((input[i+j] & bitMask) << (8 - (j + 1) * bitSize)); |
| } |
| writeByte(toWrite); |
| } |
| |
| if (remainder > 0) { |
| uint32_t startShift = 8 - bitSize; |
| char toWrite = 0; |
| for (uint32_t i = endUnroll; i < endOffSet; ++i) { |
| toWrite |= static_cast<char>((input[i] & bitMask) << startShift); |
| startShift -= bitSize; |
| } |
| writeByte(toWrite); |
| } |
| |
| } else { |
| numBytes = bitSize / 8; |
| |
| for (uint32_t i = offset; i < endOffSet; ++i) { |
| for (uint32_t j = 0; j < numBytes; ++j) { |
| char toWrite = static_cast<char>((input[i] >> (8 * (numBytes - j - 1))) & 255); |
| writeByte(toWrite); |
| } |
| } |
| } |
| |
| return; |
| } |
| |
| // write for unaligned bit size |
| uint32_t bitsLeft = 8; |
| char current = 0; |
| for(uint32_t i = offset; i < (offset + len); i++) { |
| int64_t value = input[i]; |
| uint32_t bitsToWrite = bitSize; |
| while (bitsToWrite > bitsLeft) { |
| // add the bits to the bottom of the current word |
| current |= static_cast<char>(value >> (bitsToWrite - bitsLeft)); |
| // subtract out the bits we just added |
| bitsToWrite -= bitsLeft; |
| // zero out the bits above bitsToWrite |
| value &= (static_cast<uint64_t>(1) << bitsToWrite) - 1; |
| writeByte(current); |
| current = 0; |
| bitsLeft = 8; |
| } |
| bitsLeft -= bitsToWrite; |
| current |= static_cast<char>(value << bitsLeft); |
| if (bitsLeft == 0) { |
| writeByte(current); |
| current = 0; |
| bitsLeft = 8; |
| } |
| } |
| |
| // flush |
| if (bitsLeft != 8) { |
| writeByte(current); |
| } |
| } |
| |
| void RleEncoderV2::initializeLiterals(int64_t val) { |
| literals[numLiterals++] = val; |
| fixedRunLength = 1; |
| variableRunLength = 1; |
| } |
| } |