| /** |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.orc.impl; |
| |
| import java.io.IOException; |
| import java.util.function.Consumer; |
| |
| /** |
| * <p>A writer that performs light weight compression over sequence of integers. |
| * </p> |
| * <p>There are four types of lightweight integer compression</p> |
| * <ul> |
| * <li>SHORT_REPEAT</li> |
| * <li>DIRECT</li> |
| * <li>PATCHED_BASE</li> |
| * <li>DELTA</li> |
| * </ul> |
| * <p>The description and format for these types are as below: |
| * <b>SHORT_REPEAT:</b> Used for short repeated integer sequences.</p> |
| * <ul> |
| * <li>1 byte header |
| * <ul> |
| * <li>2 bits for encoding type</li> |
| * <li>3 bits for bytes required for repeating value</li> |
| * <li>3 bits for repeat count (MIN_REPEAT + run length)</li> |
| * </ul> |
| * </li> |
| * <li>Blob - repeat value (fixed bytes)</li> |
| * </ul> |
| * <p> |
| * <b>DIRECT:</b> Used for random integer sequences whose number of bit |
| * requirement doesn't vary a lot.</p> |
| * <ul> |
| * <li>2 byte header (1st byte) |
| * <ul> |
| * <li>2 bits for encoding type</li> |
| * <li>5 bits for fixed bit width of values in blob</li> |
| * <li>1 bit for storing MSB of run length</li> |
| * </ul></li> |
| * <li>2nd byte |
| * <ul> |
| * <li>8 bits for lower run length bits</li> |
| * </ul> |
| * </li> |
| * <li>Blob - stores the direct values using fixed bit width. The length of the |
| * data blob is (fixed width * run length) bits long</li> |
| * </ul> |
| * <p> |
| * <b>PATCHED_BASE:</b> Used for random integer sequences whose number of bit |
| * requirement varies beyond a threshold.</p> |
| * <ul> |
| * <li>4 bytes header (1st byte) |
| * <ul> |
| * <li>2 bits for encoding type</li> |
| * <li>5 bits for fixed bit width of values in blob</li> |
| * <li>1 bit for storing MSB of run length</li> |
| * </ul></li> |
| * <li>2nd byte |
| * <ul> |
| * <li>8 bits for lower run length bits</li> |
| * </ul></li> |
| * <li>3rd byte |
| * <ul> |
| * <li>3 bits for bytes required to encode base value</li> |
| * <li>5 bits for patch width</li> |
| * </ul></li> |
| * <li>4th byte |
| * <ul> |
| * <li>3 bits for patch gap width</li> |
| * <li>5 bits for patch length</li> |
| * </ul> |
| * </li> |
| * <li>Base value - Stored using fixed number of bytes. If MSB is set, base |
| * value is negative else positive. Length of base value is (base width * 8) |
| * bits.</li> |
| * <li>Data blob - Base reduced values as stored using fixed bit width. Length |
| * of data blob is (fixed width * run length) bits.</li> |
| * <li>Patch blob - Patch blob is a list of gap and patch value. Each entry in |
| * the patch list is (patch width + patch gap width) bits long. Gap between the |
| * subsequent elements to be patched are stored in upper part of entry whereas |
| * patch values are stored in lower part of entry. Length of patch blob is |
| * ((patch width + patch gap width) * patch length) bits.</li> |
| * </ul> |
| * <p> |
| * <b>DELTA</b> Used for monotonically increasing or decreasing sequences, |
| * sequences with fixed delta values or long repeated sequences. |
| * <ul> |
| * <li>2 bytes header (1st byte) |
| * <ul> |
| * <li>2 bits for encoding type</li> |
| * <li>5 bits for fixed bit width of values in blob</li> |
| * <li>1 bit for storing MSB of run length</li> |
| * </ul></li> |
| * <li>2nd byte |
| * <ul> |
| * <li>8 bits for lower run length bits</li> |
| * </ul></li> |
| * <li>Base value - zigzag encoded value written as varint</li> |
| * <li>Delta base - zigzag encoded value written as varint</li> |
| * <li>Delta blob - only positive values. monotonicity and orderness are decided |
| * based on the sign of the base value and delta base</li> |
| * </ul> |
| */ |
| public class RunLengthIntegerWriterV2 implements IntegerWriter { |
| |
| public enum EncodingType { |
| SHORT_REPEAT, DIRECT, PATCHED_BASE, DELTA |
| } |
| |
| static final int MAX_SCOPE = 512; |
| static final int MIN_REPEAT = 3; |
| static final long BASE_VALUE_LIMIT = 1l << 56; |
| private static final int MAX_SHORT_REPEAT_LENGTH = 10; |
| private long prevDelta = 0; |
| private int fixedRunLength = 0; |
| private int variableRunLength = 0; |
| private final long[] literals = new long[MAX_SCOPE]; |
| private final PositionedOutputStream output; |
| private final boolean signed; |
| private EncodingType encoding; |
| private int numLiterals; |
| private final long[] zigzagLiterals = new long[MAX_SCOPE]; |
| private final long[] baseRedLiterals = new long[MAX_SCOPE]; |
| private final long[] adjDeltas = new long[MAX_SCOPE]; |
| private long fixedDelta; |
| private int zzBits90p; |
| private int zzBits100p; |
| private int brBits95p; |
| private int brBits100p; |
| private int bitsDeltaMax; |
| private int patchWidth; |
| private int patchGapWidth; |
| private int patchLength; |
| private long[] gapVsPatchList; |
| private long min; |
| private boolean isFixedDelta; |
| private SerializationUtils utils; |
| private boolean alignedBitpacking; |
| |
| RunLengthIntegerWriterV2(PositionedOutputStream output, boolean signed) { |
| this(output, signed, true); |
| } |
| |
| public RunLengthIntegerWriterV2(PositionedOutputStream output, boolean signed, |
| boolean alignedBitpacking) { |
| this.output = output; |
| this.signed = signed; |
| this.alignedBitpacking = alignedBitpacking; |
| this.utils = new SerializationUtils(); |
| clear(); |
| } |
| |
| private void writeValues() throws IOException { |
| if (numLiterals != 0) { |
| |
| if (encoding.equals(EncodingType.SHORT_REPEAT)) { |
| writeShortRepeatValues(); |
| } else if (encoding.equals(EncodingType.DIRECT)) { |
| writeDirectValues(); |
| } else if (encoding.equals(EncodingType.PATCHED_BASE)) { |
| writePatchedBaseValues(); |
| } else { |
| writeDeltaValues(); |
| } |
| |
| // clear all the variables |
| clear(); |
| } |
| } |
| |
| private void writeDeltaValues() throws IOException { |
| int len = 0; |
| int fb = bitsDeltaMax; |
| int efb = 0; |
| |
| if (alignedBitpacking) { |
| fb = utils.getClosestAlignedFixedBits(fb); |
| } |
| |
| if (isFixedDelta) { |
| // if fixed run length is greater than threshold then it will be fixed |
| // delta sequence with delta value 0 else fixed delta sequence with |
| // non-zero delta value |
| if (fixedRunLength > MIN_REPEAT) { |
| // ex. sequence: 2 2 2 2 2 2 2 2 |
| len = fixedRunLength - 1; |
| fixedRunLength = 0; |
| } else { |
| // ex. sequence: 4 6 8 10 12 14 16 |
| len = variableRunLength - 1; |
| variableRunLength = 0; |
| } |
| } else { |
| // fixed width 0 is used for long repeating values. |
| // sequences that require only 1 bit to encode will have an additional bit |
| if (fb == 1) { |
| fb = 2; |
| } |
| efb = utils.encodeBitWidth(fb); |
| efb = efb << 1; |
| len = variableRunLength - 1; |
| variableRunLength = 0; |
| } |
| |
| // extract the 9th bit of run length |
| final int tailBits = (len & 0x100) >>> 8; |
| |
| // create first byte of the header |
| final int headerFirstByte = getOpcode() | efb | tailBits; |
| |
| // second byte of the header stores the remaining 8 bits of runlength |
| final int headerSecondByte = len & 0xff; |
| |
| // write header |
| output.write(headerFirstByte); |
| output.write(headerSecondByte); |
| |
| // store the first value from zigzag literal array |
| if (signed) { |
| utils.writeVslong(output, literals[0]); |
| } else { |
| utils.writeVulong(output, literals[0]); |
| } |
| |
| if (isFixedDelta) { |
| // if delta is fixed then we don't need to store delta blob |
| utils.writeVslong(output, fixedDelta); |
| } else { |
| // store the first value as delta value using zigzag encoding |
| utils.writeVslong(output, adjDeltas[0]); |
| |
| // adjacent delta values are bit packed. The length of adjDeltas array is |
| // always one less than the number of literals (delta difference for n |
| // elements is n-1). We have already written one element, write the |
| // remaining numLiterals - 2 elements here |
| utils.writeInts(adjDeltas, 1, numLiterals - 2, fb, output); |
| } |
| } |
| |
| private void writePatchedBaseValues() throws IOException { |
| |
| // NOTE: Aligned bit packing cannot be applied for PATCHED_BASE encoding |
| // because patch is applied to MSB bits. For example: If fixed bit width of |
| // base value is 7 bits and if patch is 3 bits, the actual value is |
| // constructed by shifting the patch to left by 7 positions. |
| // actual_value = patch << 7 | base_value |
| // So, if we align base_value then actual_value can not be reconstructed. |
| |
| // write the number of fixed bits required in next 5 bits |
| final int fb = brBits95p; |
| final int efb = utils.encodeBitWidth(fb) << 1; |
| |
| // adjust variable run length, they are one off |
| variableRunLength -= 1; |
| |
| // extract the 9th bit of run length |
| final int tailBits = (variableRunLength & 0x100) >>> 8; |
| |
| // create first byte of the header |
| final int headerFirstByte = getOpcode() | efb | tailBits; |
| |
| // second byte of the header stores the remaining 8 bits of runlength |
| final int headerSecondByte = variableRunLength & 0xff; |
| |
| // if the min value is negative toggle the sign |
| final boolean isNegative = min < 0 ? true : false; |
| if (isNegative) { |
| min = -min; |
| } |
| |
| // find the number of bytes required for base and shift it by 5 bits |
| // to accommodate patch width. The additional bit is used to store the sign |
| // of the base value. |
| final int baseWidth = utils.findClosestNumBits(min) + 1; |
| final int baseBytes = baseWidth % 8 == 0 ? baseWidth / 8 : (baseWidth / 8) + 1; |
| final int bb = (baseBytes - 1) << 5; |
| |
| // if the base value is negative then set MSB to 1 |
| if (isNegative) { |
| min |= (1L << ((baseBytes * 8) - 1)); |
| } |
| |
| // third byte contains 3 bits for number of bytes occupied by base |
| // and 5 bits for patchWidth |
| final int headerThirdByte = bb | utils.encodeBitWidth(patchWidth); |
| |
| // fourth byte contains 3 bits for page gap width and 5 bits for |
| // patch length |
| final int headerFourthByte = (patchGapWidth - 1) << 5 | patchLength; |
| |
| // write header |
| output.write(headerFirstByte); |
| output.write(headerSecondByte); |
| output.write(headerThirdByte); |
| output.write(headerFourthByte); |
| |
| // write the base value using fixed bytes in big endian order |
| for(int i = baseBytes - 1; i >= 0; i--) { |
| byte b = (byte) ((min >>> (i * 8)) & 0xff); |
| output.write(b); |
| } |
| |
| // base reduced literals are bit packed |
| int closestFixedBits = utils.getClosestFixedBits(fb); |
| |
| utils.writeInts(baseRedLiterals, 0, numLiterals, closestFixedBits, |
| output); |
| |
| // write patch list |
| closestFixedBits = utils.getClosestFixedBits(patchGapWidth + patchWidth); |
| |
| utils.writeInts(gapVsPatchList, 0, gapVsPatchList.length, closestFixedBits, |
| output); |
| |
| // reset run length |
| variableRunLength = 0; |
| } |
| |
| /** |
| * Store the opcode in 2 MSB bits |
| * @return opcode |
| */ |
| private int getOpcode() { |
| return encoding.ordinal() << 6; |
| } |
| |
| private void writeDirectValues() throws IOException { |
| |
| // write the number of fixed bits required in next 5 bits |
| int fb = zzBits100p; |
| |
| if (alignedBitpacking) { |
| fb = utils.getClosestAlignedFixedBits(fb); |
| } |
| |
| final int efb = utils.encodeBitWidth(fb) << 1; |
| |
| // adjust variable run length |
| variableRunLength -= 1; |
| |
| // extract the 9th bit of run length |
| final int tailBits = (variableRunLength & 0x100) >>> 8; |
| |
| // create first byte of the header |
| final int headerFirstByte = getOpcode() | efb | tailBits; |
| |
| // second byte of the header stores the remaining 8 bits of runlength |
| final int headerSecondByte = variableRunLength & 0xff; |
| |
| // write header |
| output.write(headerFirstByte); |
| output.write(headerSecondByte); |
| |
| // bit packing the zigzag encoded literals |
| utils.writeInts(zigzagLiterals, 0, numLiterals, fb, output); |
| |
| // reset run length |
| variableRunLength = 0; |
| } |
| |
| private void writeShortRepeatValues() throws IOException { |
| // get the value that is repeating, compute the bits and bytes required |
| long repeatVal = 0; |
| if (signed) { |
| repeatVal = utils.zigzagEncode(literals[0]); |
| } else { |
| repeatVal = literals[0]; |
| } |
| |
| final int numBitsRepeatVal = utils.findClosestNumBits(repeatVal); |
| final int numBytesRepeatVal = numBitsRepeatVal % 8 == 0 ? numBitsRepeatVal >>> 3 |
| : (numBitsRepeatVal >>> 3) + 1; |
| |
| // write encoding type in top 2 bits |
| int header = getOpcode(); |
| |
| // write the number of bytes required for the value |
| header |= ((numBytesRepeatVal - 1) << 3); |
| |
| // write the run length |
| fixedRunLength -= MIN_REPEAT; |
| header |= fixedRunLength; |
| |
| // write the header |
| output.write(header); |
| |
| // write the repeating value in big endian byte order |
| for(int i = numBytesRepeatVal - 1; i >= 0; i--) { |
| int b = (int) ((repeatVal >>> (i * 8)) & 0xff); |
| output.write(b); |
| } |
| |
| fixedRunLength = 0; |
| } |
| |
| private void determineEncoding() { |
| |
| // we need to compute zigzag values for DIRECT encoding if we decide to |
| // break early for delta overflows or for shorter runs |
| computeZigZagLiterals(); |
| |
| zzBits100p = utils.percentileBits(zigzagLiterals, 0, numLiterals, 1.0); |
| |
| // not a big win for shorter runs to determine encoding |
| if (numLiterals <= MIN_REPEAT) { |
| encoding = EncodingType.DIRECT; |
| return; |
| } |
| |
| // DELTA encoding check |
| |
| // for identifying monotonic sequences |
| boolean isIncreasing = true; |
| boolean isDecreasing = true; |
| this.isFixedDelta = true; |
| |
| this.min = literals[0]; |
| long max = literals[0]; |
| final long initialDelta = literals[1] - literals[0]; |
| long currDelta = 0; |
| long deltaMax = 0; |
| this.adjDeltas[0] = initialDelta; |
| |
| for (int i = 1; i < numLiterals; i++) { |
| final long l1 = literals[i]; |
| final long l0 = literals[i - 1]; |
| currDelta = l1 - l0; |
| min = Math.min(min, l1); |
| max = Math.max(max, l1); |
| |
| isIncreasing &= (l0 <= l1); |
| isDecreasing &= (l0 >= l1); |
| |
| isFixedDelta &= (currDelta == initialDelta); |
| if (i > 1) { |
| adjDeltas[i - 1] = Math.abs(currDelta); |
| deltaMax = Math.max(deltaMax, adjDeltas[i - 1]); |
| } |
| } |
| |
| // its faster to exit under delta overflow condition without checking for |
| // PATCHED_BASE condition as encoding using DIRECT is faster and has less |
| // overhead than PATCHED_BASE |
| if (!utils.isSafeSubtract(max, min)) { |
| encoding = EncodingType.DIRECT; |
| return; |
| } |
| |
| // invariant - subtracting any number from any other in the literals after |
| // this point won't overflow |
| |
| // if min is equal to max then the delta is 0, this condition happens for |
| // fixed values run >10 which cannot be encoded with SHORT_REPEAT |
| if (min == max) { |
| assert isFixedDelta : min + "==" + max + |
| ", isFixedDelta cannot be false"; |
| assert currDelta == 0 : min + "==" + max + ", currDelta should be zero"; |
| fixedDelta = 0; |
| encoding = EncodingType.DELTA; |
| return; |
| } |
| |
| if (isFixedDelta) { |
| assert currDelta == initialDelta |
| : "currDelta should be equal to initialDelta for fixed delta encoding"; |
| encoding = EncodingType.DELTA; |
| fixedDelta = currDelta; |
| return; |
| } |
| |
| // if initialDelta is 0 then we cannot delta encode as we cannot identify |
| // the sign of deltas (increasing or decreasing) |
| if (initialDelta != 0) { |
| // stores the number of bits required for packing delta blob in |
| // delta encoding |
| bitsDeltaMax = utils.findClosestNumBits(deltaMax); |
| |
| // monotonic condition |
| if (isIncreasing || isDecreasing) { |
| encoding = EncodingType.DELTA; |
| return; |
| } |
| } |
| |
| // PATCHED_BASE encoding check |
| |
| // percentile values are computed for the zigzag encoded values. if the |
| // number of bit requirement between 90th and 100th percentile varies |
| // beyond a threshold then we need to patch the values. if the variation |
| // is not significant then we can use direct encoding |
| |
| zzBits90p = utils.percentileBits(zigzagLiterals, 0, numLiterals, 0.9); |
| int diffBitsLH = zzBits100p - zzBits90p; |
| |
| // if the difference between 90th percentile and 100th percentile fixed |
| // bits is > 1 then we need patch the values |
| if (diffBitsLH > 1) { |
| |
| // patching is done only on base reduced values. |
| // remove base from literals |
| for (int i = 0; i < numLiterals; i++) { |
| baseRedLiterals[i] = literals[i] - min; |
| } |
| |
| // 95th percentile width is used to determine max allowed value |
| // after which patching will be done |
| brBits95p = utils.percentileBits(baseRedLiterals, 0, numLiterals, 0.95); |
| |
| // 100th percentile is used to compute the max patch width |
| brBits100p = utils.percentileBits(baseRedLiterals, 0, numLiterals, 1.0); |
| |
| // after base reducing the values, if the difference in bits between |
| // 95th percentile and 100th percentile value is zero then there |
| // is no point in patching the values, in which case we will |
| // fallback to DIRECT encoding. |
| // The decision to use patched base was based on zigzag values, but the |
| // actual patching is done on base reduced literals. |
| if ((brBits100p - brBits95p) != 0 && Math.abs(min) < BASE_VALUE_LIMIT) { |
| encoding = EncodingType.PATCHED_BASE; |
| preparePatchedBlob(); |
| return; |
| } else { |
| encoding = EncodingType.DIRECT; |
| return; |
| } |
| } else { |
| // if difference in bits between 95th percentile and 100th percentile is |
| // 0, then patch length will become 0. Hence we will fallback to direct |
| encoding = EncodingType.DIRECT; |
| return; |
| } |
| } |
| |
| private void computeZigZagLiterals() { |
| // populate zigzag encoded literals |
| long zzEncVal = 0; |
| for (int i = 0; i < numLiterals; i++) { |
| if (signed) { |
| zzEncVal = utils.zigzagEncode(literals[i]); |
| } else { |
| zzEncVal = literals[i]; |
| } |
| zigzagLiterals[i] = zzEncVal; |
| } |
| } |
| |
| private void preparePatchedBlob() { |
| // mask will be max value beyond which patch will be generated |
| long mask = (1L << brBits95p) - 1; |
| |
| // since we are considering only 95 percentile, the size of gap and |
| // patch array can contain only be 5% values |
| patchLength = (int) Math.ceil((numLiterals * 0.05)); |
| |
| int[] gapList = new int[patchLength]; |
| long[] patchList = new long[patchLength]; |
| |
| // #bit for patch |
| patchWidth = brBits100p - brBits95p; |
| patchWidth = utils.getClosestFixedBits(patchWidth); |
| |
| // if patch bit requirement is 64 then it will not possible to pack |
| // gap and patch together in a long. To make sure gap and patch can be |
| // packed together adjust the patch width |
| if (patchWidth == 64) { |
| patchWidth = 56; |
| brBits95p = 8; |
| mask = (1L << brBits95p) - 1; |
| } |
| |
| int gapIdx = 0; |
| int patchIdx = 0; |
| int prev = 0; |
| int gap = 0; |
| int maxGap = 0; |
| |
| for(int i = 0; i < numLiterals; i++) { |
| // if value is above mask then create the patch and record the gap |
| if (baseRedLiterals[i] > mask) { |
| gap = i - prev; |
| if (gap > maxGap) { |
| maxGap = gap; |
| } |
| |
| // gaps are relative, so store the previous patched value index |
| prev = i; |
| gapList[gapIdx++] = gap; |
| |
| // extract the most significant bits that are over mask bits |
| long patch = baseRedLiterals[i] >>> brBits95p; |
| patchList[patchIdx++] = patch; |
| |
| // strip off the MSB to enable safe bit packing |
| baseRedLiterals[i] &= mask; |
| } |
| } |
| |
| // adjust the patch length to number of entries in gap list |
| patchLength = gapIdx; |
| |
| // if the element to be patched is the first and only element then |
| // max gap will be 0, but to store the gap as 0 we need atleast 1 bit |
| if (maxGap == 0 && patchLength != 0) { |
| patchGapWidth = 1; |
| } else { |
| patchGapWidth = utils.findClosestNumBits(maxGap); |
| } |
| |
| // special case: if the patch gap width is greater than 256, then |
| // we need 9 bits to encode the gap width. But we only have 3 bits in |
| // header to record the gap width. To deal with this case, we will save |
| // two entries in patch list in the following way |
| // 256 gap width => 0 for patch value |
| // actual gap - 256 => actual patch value |
| // We will do the same for gap width = 511. If the element to be patched is |
| // the last element in the scope then gap width will be 511. In this case we |
| // will have 3 entries in the patch list in the following way |
| // 255 gap width => 0 for patch value |
| // 255 gap width => 0 for patch value |
| // 1 gap width => actual patch value |
| if (patchGapWidth > 8) { |
| patchGapWidth = 8; |
| // for gap = 511, we need two additional entries in patch list |
| if (maxGap == 511) { |
| patchLength += 2; |
| } else { |
| patchLength += 1; |
| } |
| } |
| |
| // create gap vs patch list |
| gapIdx = 0; |
| patchIdx = 0; |
| gapVsPatchList = new long[patchLength]; |
| for(int i = 0; i < patchLength; i++) { |
| long g = gapList[gapIdx++]; |
| long p = patchList[patchIdx++]; |
| while (g > 255) { |
| gapVsPatchList[i++] = (255L << patchWidth); |
| g -= 255; |
| } |
| |
| // store patch value in LSBs and gap in MSBs |
| gapVsPatchList[i] = (g << patchWidth) | p; |
| } |
| } |
| |
| /** |
| * clears all the variables |
| */ |
| private void clear() { |
| numLiterals = 0; |
| encoding = null; |
| prevDelta = 0; |
| fixedDelta = 0; |
| zzBits90p = 0; |
| zzBits100p = 0; |
| brBits95p = 0; |
| brBits100p = 0; |
| bitsDeltaMax = 0; |
| patchGapWidth = 0; |
| patchLength = 0; |
| patchWidth = 0; |
| gapVsPatchList = null; |
| min = 0; |
| isFixedDelta = true; |
| } |
| |
| @Override |
| public void flush() throws IOException { |
| if (numLiterals != 0) { |
| if (variableRunLength != 0) { |
| determineEncoding(); |
| writeValues(); |
| } else if (fixedRunLength != 0) { |
| if (fixedRunLength < MIN_REPEAT) { |
| variableRunLength = fixedRunLength; |
| fixedRunLength = 0; |
| determineEncoding(); |
| writeValues(); |
| } else if (fixedRunLength >= MIN_REPEAT |
| && fixedRunLength <= MAX_SHORT_REPEAT_LENGTH) { |
| encoding = EncodingType.SHORT_REPEAT; |
| writeValues(); |
| } else { |
| encoding = EncodingType.DELTA; |
| isFixedDelta = true; |
| writeValues(); |
| } |
| } |
| } |
| output.flush(); |
| } |
| |
| @Override |
| public void write(long val) throws IOException { |
| if (numLiterals == 0) { |
| initializeLiterals(val); |
| } else { |
| if (numLiterals == 1) { |
| prevDelta = val - literals[0]; |
| literals[numLiterals++] = val; |
| // if both values are same count as fixed run else variable run |
| if (val == literals[0]) { |
| fixedRunLength = 2; |
| variableRunLength = 0; |
| } else { |
| fixedRunLength = 0; |
| variableRunLength = 2; |
| } |
| } else { |
| long currentDelta = val - literals[numLiterals - 1]; |
| if (prevDelta == 0 && currentDelta == 0) { |
| // fixed delta run |
| |
| literals[numLiterals++] = val; |
| |
| // if variable run is non-zero then we are seeing repeating |
| // values at the end of variable run in which case keep |
| // updating variable and fixed runs |
| if (variableRunLength > 0) { |
| fixedRunLength = 2; |
| } |
| fixedRunLength += 1; |
| |
| // if fixed run met the minimum condition and if variable |
| // run is non-zero then flush the variable run and shift the |
| // tail fixed runs to start of the buffer |
| if (fixedRunLength >= MIN_REPEAT && variableRunLength > 0) { |
| numLiterals -= MIN_REPEAT; |
| variableRunLength -= MIN_REPEAT - 1; |
| // copy the tail fixed runs |
| long[] tailVals = new long[MIN_REPEAT]; |
| System.arraycopy(literals, numLiterals, tailVals, 0, MIN_REPEAT); |
| |
| // determine variable encoding and flush values |
| determineEncoding(); |
| writeValues(); |
| |
| // shift tail fixed runs to beginning of the buffer |
| for(long l : tailVals) { |
| literals[numLiterals++] = l; |
| } |
| } |
| |
| // if fixed runs reached max repeat length then write values |
| if (fixedRunLength == MAX_SCOPE) { |
| determineEncoding(); |
| writeValues(); |
| } |
| } else { |
| // variable delta run |
| |
| // if fixed run length is non-zero and if it satisfies the |
| // short repeat conditions then write the values as short repeats |
| // else use delta encoding |
| if (fixedRunLength >= MIN_REPEAT) { |
| if (fixedRunLength <= MAX_SHORT_REPEAT_LENGTH) { |
| encoding = EncodingType.SHORT_REPEAT; |
| writeValues(); |
| } else { |
| encoding = EncodingType.DELTA; |
| isFixedDelta = true; |
| writeValues(); |
| } |
| } |
| |
| // if fixed run length is <MIN_REPEAT and current value is |
| // different from previous then treat it as variable run |
| if (fixedRunLength > 0 && fixedRunLength < MIN_REPEAT) { |
| if (val != literals[numLiterals - 1]) { |
| variableRunLength = fixedRunLength; |
| fixedRunLength = 0; |
| } |
| } |
| |
| // after writing values re-initialize the variables |
| if (numLiterals == 0) { |
| initializeLiterals(val); |
| } else { |
| // keep updating variable run lengths |
| prevDelta = val - literals[numLiterals - 1]; |
| literals[numLiterals++] = val; |
| variableRunLength += 1; |
| |
| // if variable run length reach the max scope, write it |
| if (variableRunLength == MAX_SCOPE) { |
| determineEncoding(); |
| writeValues(); |
| } |
| } |
| } |
| } |
| } |
| } |
| |
| private void initializeLiterals(long val) { |
| literals[numLiterals++] = val; |
| fixedRunLength = 1; |
| variableRunLength = 1; |
| } |
| |
| @Override |
| public void getPosition(PositionRecorder recorder) throws IOException { |
| output.getPosition(recorder); |
| recorder.addPosition(numLiterals); |
| } |
| |
| @Override |
| public long estimateMemory() { |
| return output.getBufferSize(); |
| } |
| |
| @Override |
| public void changeIv(Consumer<byte[]> modifier) { |
| output.changeIv(modifier); |
| } |
| } |