blob: f173ab1bd52d4a08699721153c88cb6e0212f94b [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.datasketches.cpc;
import static org.apache.datasketches.cpc.CompressionData.columnPermutationsForDecoding;
import static org.apache.datasketches.cpc.CompressionData.columnPermutationsForEncoding;
import static org.apache.datasketches.cpc.CompressionData.decodingTablesForHighEntropyByte;
import static org.apache.datasketches.cpc.CompressionData.encodingTablesForHighEntropyByte;
import static org.apache.datasketches.cpc.CompressionData.lengthLimitedUnaryDecodingTable65;
import static org.apache.datasketches.cpc.CompressionData.lengthLimitedUnaryEncodingTable65;
import static org.apache.datasketches.cpc.PairTable.introspectiveInsertionSort;
//import static org.apache.datasketches.cpc.RuntimeAsserts.rtAssertEquals;
/**
* @author Lee Rhodes
* @author Kevin Lang
*/
final class CpcCompression {
//visible for test
static final int NEXT_WORD_IDX = 0; //ptrArr[NEXT_WORD_IDX]
static final int BIT_BUF = 1; //ptrArr[BIT_BUF]
static final int BUF_BITS = 2; //ptrArr[BUF_BITS]
//visible for test
static void writeUnary(
final int[] compressedWords,
final long[] ptrArr,
final int theValue) {
int nextWordIndex = (int) ptrArr[NEXT_WORD_IDX]; //must be int
assert (nextWordIndex == ptrArr[NEXT_WORD_IDX]); //catch truncation error
long bitBuf = ptrArr[BIT_BUF]; //must be long
int bufBits = (int) ptrArr[BUF_BITS]; //could be byte
assert (compressedWords != null);
assert (nextWordIndex >= 0);
assert (bitBuf >= 0);
assert ((bufBits >= 0) && (bufBits <= 31));
int remaining = theValue;
while (remaining >= 16) {
remaining -= 16;
// Here we output 16 zeros, but we don't need to physically write them into bitbuf
// because it already contains zeros in that region.
bufBits += 16; // Record the fact that 16 bits of output have occurred.
//MAYBE_FLUSH_BITBUF(compressedWords, nextWordIndex);
if (bufBits >= 32) {
compressedWords[nextWordIndex++] = (int) bitBuf;
bitBuf >>>= 32;
bufBits -= 32;
}
}
assert (remaining >= 0) && (remaining <= 15);
final long theUnaryCode = 1L << remaining; //must be a long
bitBuf |= theUnaryCode << bufBits;
bufBits += (1 + remaining);
//MAYBE_FLUSH_BITBUF(compressedWords, nextWordIndex);
if (bufBits >= 32) {
compressedWords[nextWordIndex++] = (int) bitBuf;
bitBuf >>>= 32;
bufBits -= 32;
}
ptrArr[NEXT_WORD_IDX] = nextWordIndex;
assert nextWordIndex == ptrArr[NEXT_WORD_IDX]; //catch sign extension error
ptrArr[BIT_BUF] = bitBuf;
ptrArr[BUF_BITS] = bufBits;
}
//visible for test
static long readUnary(
final int[] compressedWords,
final long[] ptrArr) {
int nextWordIndex = (int) ptrArr[NEXT_WORD_IDX]; //must be int
assert nextWordIndex == ptrArr[NEXT_WORD_IDX]; //catch truncation error
long bitBuf = ptrArr[BIT_BUF];
int bufBits = (int) ptrArr[BUF_BITS];
assert compressedWords != null;
assert nextWordIndex >= 0;
assert bitBuf >= 0;
assert bufBits >= 0;
long subTotal = 0;
int trailingZeros;
//readUnaryLoop:
while (true) {
//MAYBE_FILL_BITBUF(compressedWords,nextWordIndex,8); // ensure 8 bits in bit buffer
if (bufBits < 8) { // Prepare for an 8-bit peek into the bitstream.
bitBuf |= ((compressedWords[nextWordIndex++] & 0XFFFF_FFFFL) << bufBits);
bufBits += 32;
}
// These 8 bits include either all or part of the Unary codeword.
final int peek8 = (int) (bitBuf & 0XFFL);
trailingZeros = Math.min(8, Integer.numberOfTrailingZeros(peek8));
assert ((trailingZeros >= 0) && (trailingZeros <= 8)) : "TZ+ " + trailingZeros;
if (trailingZeros == 8) { // The codeword was partial, so read some more.
subTotal += 8;
bufBits -= 8;
bitBuf >>>= 8;
continue;
}
break;
}
bufBits -= (1 + trailingZeros);
bitBuf >>>= (1 + trailingZeros);
ptrArr[NEXT_WORD_IDX] = nextWordIndex;
assert nextWordIndex == ptrArr[NEXT_WORD_IDX]; //catch sign extension error
ptrArr[BIT_BUF] = bitBuf;
ptrArr[BUF_BITS] = bufBits;
return subTotal + trailingZeros;
}
/**
* This returns the number of compressedWords that were actually used.
* @param byteArray input
* @param numBytesToEncode input
* @param encodingTable input
* @param compressedWords output
* @return the number of compressedWords that were actually used.
*/
//visible for test
//It is the caller's responsibility to ensure that the compressedWords array is long enough.
static int lowLevelCompressBytes(
final byte[] byteArray, // input
final int numBytesToEncode, // input, must be an int
final short[] encodingTable, // input
final int[] compressedWords) { // output
int nextWordIndex = 0;
long bitBuf = 0; // bits are packed into this first, then are flushed to compressedWords
int bufBits = 0; // number of bits currently in bitbuf; must be between 0 and 31
for (int byteIndex = 0; byteIndex < numBytesToEncode; byteIndex++) {
final int theByte = byteArray[byteIndex] & 0XFF;
final long codeInfo = (encodingTable[theByte] & 0XFFFFL);
final long codeVal = codeInfo & 0XFFFL;
final int codeWordLength = (int) (codeInfo >>> 12);
bitBuf |= (codeVal << bufBits);
bufBits += codeWordLength;
//MAYBE_FLUSH_BITBUF(compressedWords, nextWordIndex);
if (bufBits >= 32) {
compressedWords[nextWordIndex++] = (int) bitBuf;
bitBuf >>>= 32;
bufBits -= 32;
}
}
//Pad the bitstream with 11 zero-bits so that the decompressor's 12-bit peek
// can't overrun its input.
bufBits += 11;
//MAYBE_FLUSH_BITBUF(compressedWords, nextWordIndex);
if (bufBits >= 32) {
compressedWords[nextWordIndex++] = (int) bitBuf;
bitBuf >>>= 32;
bufBits -= 32;
}
if (bufBits > 0) { // We are done encoding now, so we flush the bit buffer.
assert (bufBits < 32);
compressedWords[nextWordIndex++] = (int) bitBuf;
}
return nextWordIndex;
}
//visible for test
static void lowLevelUncompressBytes(
final byte[] byteArray, // output
final int numBytesToDecode, // input (but refers to the output)
final short[] decodingTable, // input
final int[] compressedWords, // input
final long numCompressedWords) { // input
int byteIndex = 0;
int nextWordIndex = 0;
long bitBuf = 0;
int bufBits = 0;
assert (byteArray != null);
assert (decodingTable != null);
assert (compressedWords != null);
for (byteIndex = 0; byteIndex < numBytesToDecode; byteIndex++) {
//MAYBE_FILL_BITBUF(compressedWords,wordIndex,12); // ensure 12 bits in bit buffer
if (bufBits < 12) { // Prepare for a 12-bit peek into the bitstream.
bitBuf |= ((compressedWords[nextWordIndex++] & 0XFFFF_FFFFL) << bufBits);
bufBits += 32;
}
// These 12 bits will include an entire Huffman codeword.
final int peek12 = (int) (bitBuf & 0XFFFL);
final int lookup = decodingTable[peek12] & 0XFFFF;
final int codeWordLength = lookup >>> 8;
final byte decodedByte = (byte) (lookup & 0XFF);
byteArray[byteIndex] = decodedByte;
bitBuf >>>= codeWordLength;
bufBits -= codeWordLength;
}
// Buffer over-run should be impossible unless there is a bug.
// However, we might as well check here.
assert (nextWordIndex <= numCompressedWords);
}
/**
* Here "pairs" refers to row/column pairs that specify the positions of surprising values in
* the bit matrix.
* @param pairArray input
* @param numPairsToEncode input
* @param numBaseBits input
* @param compressedWords output
* @return the number of compressedWords actually used
*/
//visible for test
static long lowLevelCompressPairs(
final int[] pairArray, // input
final int numPairsToEncode, // input
final int numBaseBits, // input //cannot exceed 63 or 6 bits, could be byte
final int[] compressedWords) { // output
int pairIndex = 0;
final long[] ptrArr = new long[3];
int nextWordIndex = 0; //must be int
long bitBuf = 0; //must be long
int bufBits = 0; //could be byte
final long golombLoMask = (1L << numBaseBits) - 1L;
int predictedRowIndex = 0;
int predictedColIndex = 0;
for (pairIndex = 0; pairIndex < numPairsToEncode; pairIndex++) {
final int rowCol = pairArray[pairIndex];
final int rowIndex = rowCol >>> 6;
final int colIndex = rowCol & 0X3F; //63
if (rowIndex != predictedRowIndex) { predictedColIndex = 0; }
assert (rowIndex >= predictedRowIndex);
assert (colIndex >= predictedColIndex);
final long yDelta = rowIndex - predictedRowIndex; //cannot exceed 2^26
final int xDelta = colIndex - predictedColIndex; //cannot exceed 65
predictedRowIndex = rowIndex;
predictedColIndex = colIndex + 1;
final long codeInfo = lengthLimitedUnaryEncodingTable65[xDelta] & 0XFFFFL;
final long codeVal = codeInfo & 0XFFFL;
final int codeLen = (int) (codeInfo >>> 12);
bitBuf |= (codeVal << bufBits);
bufBits += codeLen;
//MAYBE_FLUSH_BITBUF(compressedWords, nextWordIndex);
if (bufBits >= 32) {
compressedWords[nextWordIndex++] = (int) bitBuf;
bitBuf >>>= 32;
bufBits -= 32;
}
final long golombLo = yDelta & golombLoMask; //long for bitBuf
final long golombHi = yDelta >>> numBaseBits; //cannot exceed 2^26
//TODO Inline WriteUnary
ptrArr[NEXT_WORD_IDX] = nextWordIndex;
ptrArr[BIT_BUF] = bitBuf;
ptrArr[BUF_BITS] = bufBits;
assert nextWordIndex == ptrArr[NEXT_WORD_IDX]; //catch sign extension error
writeUnary(compressedWords, ptrArr, (int) golombHi);
nextWordIndex = (int) ptrArr[NEXT_WORD_IDX];
bitBuf = ptrArr[BIT_BUF];
bufBits = (int) ptrArr[BUF_BITS];
assert nextWordIndex == ptrArr[NEXT_WORD_IDX]; //catch truncation error
//END Inline WriteUnary
bitBuf |= golombLo << bufBits;
bufBits += numBaseBits;
//MAYBE_FLUSH_BITBUF(compressedWords, nextWordIndex);
if (bufBits >= 32) {
compressedWords[nextWordIndex++] = (int) bitBuf;
bitBuf >>>= 32;
bufBits -= 32;
}
}
// Pad the bitstream so that the decompressor's 12-bit peek can't overrun its input.
int padding = 10 - numBaseBits;
if (padding < 0) { padding = 0; }
bufBits += padding;
//MAYBE_FLUSH_BITBUF(compressedWords, nextWordIndex);
if (bufBits >= 32) {
compressedWords[nextWordIndex++] = (int) bitBuf;
bitBuf >>>= 32;
bufBits -= 32;
}
if (bufBits > 0) { // We are done encoding now, so we flush the bit buffer.
assert (bufBits < 32);
compressedWords[nextWordIndex++] = (int) bitBuf;
//bitBuf = 0;
//bufBits = 0; // not really necessary
}
return nextWordIndex;
}
//visible for test
static void lowLevelUncompressPairs(
final int[] pairArray, // output
final int numPairsToDecode, // input, size of output, must be int
final int numBaseBits, // input, cannot exceed 6 bits
final int[] compressedWords, // input
final long numCompressedWords) { // input
int pairIndex = 0;
final long[] ptrArr = new long[3];
int nextWordIndex = 0;
long bitBuf = 0;
int bufBits = 0;
final long golombLoMask = (1L << numBaseBits) - 1L;
int predictedRowIndex = 0;
int predictedColIndex = 0;
// for each pair we need to read:
// xDelta (12-bit length-limited unary)
// yDeltaHi (unary)
// yDeltaLo (basebits)
for (pairIndex = 0; pairIndex < numPairsToDecode; pairIndex++) {
//MAYBE_FILL_BITBUF(compressedWords,wordIndex,12); // ensure 12 bits in bit buffer
if (bufBits < 12) { // Prepare for a 12-bit peek into the bitstream.
bitBuf |= ((compressedWords[nextWordIndex++] & 0XFFFF_FFFFL) << bufBits);
bufBits += 32;
}
final int peek12 = (int) (bitBuf & 0XFFFL);
final int lookup = lengthLimitedUnaryDecodingTable65[peek12] & 0XFFFF;
final int codeWordLength = lookup >>> 8;
final int xDelta = lookup & 0XFF;
bitBuf >>>= codeWordLength;
bufBits -= codeWordLength;
//TODO Inline ReadUnary
ptrArr[NEXT_WORD_IDX] = nextWordIndex;
ptrArr[BIT_BUF] = bitBuf;
ptrArr[BUF_BITS] = bufBits;
assert nextWordIndex == ptrArr[NEXT_WORD_IDX]; //catch sign extension error
final long golombHi = readUnary(compressedWords, ptrArr);
nextWordIndex = (int) ptrArr[NEXT_WORD_IDX];
bitBuf = ptrArr[BIT_BUF];
bufBits = (int) ptrArr[BUF_BITS];
assert nextWordIndex == ptrArr[NEXT_WORD_IDX]; //catch truncation error
//END Inline ReadUnary
//MAYBE_FILL_BITBUF(compressedWords,wordIndex,numBaseBits); // ensure numBaseBits in bit buffer
if (bufBits < numBaseBits) { // Prepare for a numBaseBits peek into the bitstream.
bitBuf |= ((compressedWords[nextWordIndex++] & 0XFFFF_FFFFL) << bufBits);
bufBits += 32;
}
final long golombLo = bitBuf & golombLoMask;
bitBuf >>>= numBaseBits;
bufBits -= numBaseBits;
final long yDelta = (golombHi << numBaseBits) | golombLo;
// Now that we have yDelta and xDelta, we can compute the pair's row and column.
if (yDelta > 0) { predictedColIndex = 0; }
final int rowIndex = predictedRowIndex + (int) yDelta;
final int colIndex = predictedColIndex + xDelta;
final int rowCol = (rowIndex << 6) | colIndex;
pairArray[pairIndex] = rowCol;
predictedRowIndex = rowIndex;
predictedColIndex = colIndex + 1;
}
// check for buffer over-run
assert (nextWordIndex <= numCompressedWords)
: "nextWdIdx: " + nextWordIndex + ", #CompWds: " + numCompressedWords;
}
private static int safeLengthForCompressedPairBuf(
final long k, final long numPairs, final long numBaseBits) {
assert (numPairs > 0);
// long ybits = k + numPairs; // simpler and safer UB
// The following tighter UB on ybits is based on page 198
// of the textbook "Managing Gigabytes" by Witten, Moffat, and Bell.
// Notice that if numBaseBits == 0 it coincides with (k + numPairs).
final long ybits = (numPairs * (1L + numBaseBits)) + (k >>> numBaseBits);
final long xbits = 12 * numPairs;
long padding = 10L - numBaseBits;
if (padding < 0) { padding = 0; }
final long bits = xbits + ybits + padding;
//final long words = divideLongsRoundingUp(bits, 32);
final long words = CpcCompression.divideBy32RoundingUp(bits);
assert words < (1L << 31);
return (int) words;
}
// Explanation of padding: we write
// 1) xdelta (huffman, provides at least 1 bit, requires 12-bit lookahead)
// 2) ydeltaGolombHi (unary, provides at least 1 bit, requires 8-bit lookahead)
// 3) ydeltaGolombLo (straight B bits).
// So the 12-bit lookahead is the tight constraint, but there are at least (2 + B) bits emitted,
// so we would be safe with max (0, 10 - B) bits of padding at the end of the bitstream.
private static int safeLengthForCompressedWindowBuf(final long k) { // measured in 32-bit words
// 11 bits of padding, due to 12-bit lookahead, with 1 bit certainly present.
final long bits = (12 * k) + 11;
//cannot exceed Integer.MAX_VALUE
//return (int) (divideLongsRoundingUp(bits, 32));
return (int) CpcCompression.divideBy32RoundingUp(bits);
}
private static int determinePseudoPhase(final int lgK, final long numCoupons) {
final long k = 1L << lgK;
final long c = numCoupons;
// This midrange logic produces pseudo-phases. They are used to select encoding tables.
// The thresholds were chosen by hand after looking at plots of measured compression.
if ((1000 * c) < (2375 * k)) {
if ( (4 * c) < (3 * k)) { return ( 16 + 0 ); } // midrange table
else if ( (10 * c) < (11 * k)) { return ( 16 + 1 ); } // midrange table
else if ( (100 * c) < (132 * k)) { return ( 16 + 2 ); } // midrange table
else if ( (3 * c) < (5 * k)) { return ( 16 + 3 ); } // midrange table
else if ((1000 * c) < (1965 * k)) { return ( 16 + 4 ); } // midrange table
else if ((1000 * c) < (2275 * k)) { return ( 16 + 5 ); } // midrange table
else { return 6; } // steady-state table employed before its actual phase
}
else {
// This steady-state logic produces true phases. They are used to select
// encoding tables, and also column permutations for the "Sliding" flavor.
assert lgK >= 4;
final long tmp = c >>> (lgK - 4);
final int phase = (int) (tmp & 15L);
assert (phase >= 0) && (phase < 16);
return phase;
}
}
private static void compressTheWindow(final CompressedState target, final CpcSketch source) {
final int srcLgK = source.lgK;
final int srcK = 1 << srcLgK;
final int windowBufLen = safeLengthForCompressedWindowBuf(srcK);
final int[] windowBuf = new int[windowBufLen];
final int pseudoPhase = determinePseudoPhase(srcLgK, source.numCoupons);
target.cwLengthInts = lowLevelCompressBytes(
source.slidingWindow,
srcK,
encodingTablesForHighEntropyByte[pseudoPhase],
windowBuf);
// At this point we free the unused portion of the compression output buffer.
// final int[] shorterBuf = Arrays.copyOf(windowBuf, target.cwLength);
// target.compressedWindow = shorterBuf;
target.cwStream = windowBuf; //avoid extra copy
}
private static void uncompressTheWindow(final CpcSketch target, final CompressedState source) {
final int srcLgK = source.lgK;
final int srcK = 1 << srcLgK;
final byte[] window = new byte[srcK];
// bzero ((void *) window, (size_t) k); // zeroing not needed here (unlike the Hybrid Flavor)
assert (target.slidingWindow == null);
target.slidingWindow = window;
final int pseudoPhase = determinePseudoPhase(srcLgK, source.numCoupons);
assert (source.cwStream != null);
lowLevelUncompressBytes(target.slidingWindow, srcK,
decodingTablesForHighEntropyByte[pseudoPhase],
source.cwStream,
source.cwLengthInts);
}
private static void compressTheSurprisingValues(final CompressedState target, final CpcSketch source,
final int[] pairs, final int numPairs) {
assert (numPairs > 0);
target.numCsv = numPairs;
final int srcK = 1 << source.lgK;
final int numBaseBits = CpcCompression.golombChooseNumberOfBaseBits(srcK + numPairs, numPairs);
final int pairBufLen = safeLengthForCompressedPairBuf(srcK, numPairs, numBaseBits);
final int[] pairBuf = new int[pairBufLen];
target.csvLengthInts = (int) lowLevelCompressPairs(pairs, numPairs, numBaseBits, pairBuf);
// At this point we free the unused portion of the compression output buffer.
// final int[] shorterBuf = Arrays.copyOf(pairBuf, target.csvLength);
// target.compressedWindow = shorterBuf;
target.csvStream = pairBuf; //avoid extra copy
}
//allocates and returns an array of uncompressed pairs.
//the length of this array is known to the source sketch.
private static int[] uncompressTheSurprisingValues(final CompressedState source) {
final int srcK = 1 << source.lgK;
final int numPairs = source.numCsv;
assert numPairs > 0;
final int[] pairs = new int[numPairs];
final int numBaseBits = CpcCompression.golombChooseNumberOfBaseBits(srcK + numPairs, numPairs);
lowLevelUncompressPairs(pairs, numPairs, numBaseBits, source.csvStream, source.csvLengthInts);
return pairs;
}
private static void compressSparseFlavor(final CompressedState target, final CpcSketch source) {
assert (source.slidingWindow == null); //there is no window to compress
final PairTable srcPairTable = source.pairTable;
final int srcNumPairs = srcPairTable.getNumPairs();
final int[] srcPairArr = PairTable.unwrappingGetItems(srcPairTable, srcNumPairs);
introspectiveInsertionSort(srcPairArr, 0, srcNumPairs - 1);
compressTheSurprisingValues(target, source, srcPairArr, srcNumPairs);
}
private static void uncompressSparseFlavor(final CpcSketch target, final CompressedState source) {
assert (source.cwStream == null);
assert (source.csvStream != null);
final int[] srcPairArr = uncompressTheSurprisingValues(source);
final int numPairs = source.numCsv;
final PairTable table = PairTable.newInstanceFromPairsArray(srcPairArr, numPairs, source.lgK);
target.pairTable = table;
}
//The empty space that this leaves at the beginning of the output array
// will be filled in later by the caller.
private static int[] trickyGetPairsFromWindow(final byte[] window, final int k, final int numPairsToGet,
final int emptySpace) {
final int outputLength = emptySpace + numPairsToGet;
final int[] pairs = new int[outputLength];
int rowIndex = 0;
int pairIndex = emptySpace;
for (rowIndex = 0; rowIndex < k; rowIndex++) {
int wByte = window[rowIndex] & 0XFF;
while (wByte != 0) {
final int colIndex = Integer.numberOfTrailingZeros(wByte);
// assert (colIndex < 8);
wByte ^= (1 << colIndex); // erase the 1
pairs[pairIndex++] = (rowIndex << 6) | colIndex;
}
}
assert (pairIndex == outputLength);
return (pairs);
}
//This is complicated because it effectively builds a Sparse version
//of a Pinned sketch before compressing it. Hence the name Hybrid.
private static void compressHybridFlavor(final CompressedState target, final CpcSketch source) {
final int srcK = 1 << source.lgK;
final PairTable srcPairTable = source.pairTable;
final int srcNumPairs = srcPairTable.getNumPairs();
final int[] srcPairArr = PairTable.unwrappingGetItems(srcPairTable, srcNumPairs);
introspectiveInsertionSort(srcPairArr, 0, srcNumPairs - 1);
final byte[] srcSlidingWindow = source.slidingWindow;
final int srcWindowOffset = source.windowOffset;
final long srcNumCoupons = source.numCoupons;
assert (srcSlidingWindow != null);
assert (srcWindowOffset == 0);
final long numPairs = srcNumCoupons - srcNumPairs; // because the window offset is zero
assert numPairs < Integer.MAX_VALUE;
final int numPairsFromArray = (int) numPairs;
assert (numPairsFromArray + srcNumPairs) == srcNumCoupons; //for test
final int[] allPairs
= trickyGetPairsFromWindow(srcSlidingWindow, srcK, numPairsFromArray, srcNumPairs);
PairTable.merge(srcPairArr, 0, srcNumPairs,
allPairs, srcNumPairs, numPairsFromArray,
allPairs, 0); // note the overlapping subarray trick
//FOR TESTING If needed
// for (int i = 0; i < (source.numCoupons - 1); i++) {
// assert (Integer.compareUnsigned(allPairs[i], allPairs[i + 1]) < 0); }
compressTheSurprisingValues(target, source, allPairs, (int) srcNumCoupons);
}
private static void uncompressHybridFlavor(final CpcSketch target, final CompressedState source) {
assert (source.cwStream == null);
assert (source.csvStream != null);
final int[] pairs = uncompressTheSurprisingValues(source); //fail path 3
final int numPairs = source.numCsv;
// In the hybrid flavor, some of these pairs actually
// belong in the window, so we will separate them out,
// moving the "true" pairs to the bottom of the array.
final int srcLgK = source.lgK;
final int k = 1 << srcLgK;
final byte[] window = new byte[k];
int nextTruePair = 0;
for (int i = 0; i < numPairs; i++) {
final int rowCol = pairs[i];
assert (rowCol != -1);
final int col = rowCol & 63;
if (col < 8) {
final int row = rowCol >>> 6;
window[row] |= (1 << col); // set the window bit
}
else {
pairs[nextTruePair++] = rowCol; // move true pair down
}
}
assert (source.getWindowOffset() == 0);
target.windowOffset = 0;
final PairTable table = PairTable.newInstanceFromPairsArray(pairs, nextTruePair, srcLgK);
target.pairTable = table;
target.slidingWindow = window;
}
private static void compressPinnedFlavor(final CompressedState target, final CpcSketch source) {
compressTheWindow(target, source);
final PairTable srcPairTable = source.pairTable;
final int numPairs = srcPairTable.getNumPairs();
if (numPairs > 0) {
final int[] pairs = PairTable.unwrappingGetItems(srcPairTable, numPairs);
// Here we subtract 8 from the column indices. Because they are stored in the low 6 bits
// of each rowCol pair, and because no column index is less than 8 for a "Pinned" sketch,
// I believe we can simply subtract 8 from the pairs themselves.
// shift the columns over by 8 positions before compressing (because of the window)
for (int i = 0; i < numPairs; i++) {
assert (pairs[i] & 63) >= 8;
pairs[i] -= 8;
}
introspectiveInsertionSort(pairs, 0, numPairs - 1);
compressTheSurprisingValues(target, source, pairs, numPairs);
}
}
private static void uncompressPinnedFlavor(final CpcSketch target, final CompressedState source) {
assert (source.cwStream != null);
uncompressTheWindow(target, source);
final int srcLgK = source.lgK;
final int numPairs = source.numCsv;
if (numPairs == 0) {
target.pairTable = new PairTable(2, 6 + srcLgK);
}
else {
assert numPairs > 0;
assert source.csvStream != null;
final int[] pairs = uncompressTheSurprisingValues(source);
// undo the compressor's 8-column shift
for (int i = 0; i < numPairs; i++) {
assert (pairs[i] & 63) < 56;
pairs[i] += 8;
}
final PairTable table = PairTable.newInstanceFromPairsArray(pairs, numPairs, srcLgK);
target.pairTable = table;
}
}
//Complicated by the existence of both a left fringe and a right fringe.
private static void compressSlidingFlavor(final CompressedState target, final CpcSketch source) {
compressTheWindow(target, source);
final PairTable srcPairTable = source.pairTable;
final int numPairs = srcPairTable.getNumPairs();
if (numPairs > 0) {
final int[] pairs = PairTable.unwrappingGetItems(srcPairTable, numPairs);
// Here we apply a complicated transformation to the column indices, which
// changes the implied ordering of the pairs, so we must do it before sorting.
final int pseudoPhase = determinePseudoPhase(source.lgK, source.numCoupons); // NB
assert (pseudoPhase < 16);
final byte[] permutation = columnPermutationsForEncoding[pseudoPhase];
final int offset = source.windowOffset;
assert ((offset > 0) && (offset <= 56));
for (int i = 0; i < numPairs; i++) {
final int rowCol = pairs[i];
final int row = rowCol >>> 6;
int col = (rowCol & 63);
// first rotate the columns into a canonical configuration:
// new = ((old - (offset+8)) + 64) mod 64
col = ((col + 56) - offset) & 63;
assert (col >= 0) && (col < 56);
// then apply the permutation
col = permutation[col];
pairs[i] = (row << 6) | col;
}
introspectiveInsertionSort(pairs, 0, numPairs - 1);
compressTheSurprisingValues(target, source, pairs, numPairs);
}
}
private static void uncompressSlidingFlavor(final CpcSketch target, final CompressedState source) {
assert (source.cwStream != null);
uncompressTheWindow(target, source);
final int srcLgK = source.lgK;
final int numPairs = source.numCsv;
if (numPairs == 0) {
target.pairTable = new PairTable(2, 6 + srcLgK);
}
else {
assert (numPairs > 0);
assert (source.csvStream != null);
final int[] pairs = uncompressTheSurprisingValues(source);
final int pseudoPhase = determinePseudoPhase(srcLgK, source.numCoupons); // NB
assert (pseudoPhase < 16);
final byte[] permutation = columnPermutationsForDecoding[pseudoPhase];
final int offset = source.getWindowOffset();
assert (offset > 0) && (offset <= 56);
for (int i = 0; i < numPairs; i++) {
final int rowCol = pairs[i];
final int row = rowCol >>> 6;
int col = rowCol & 63;
// first undo the permutation
col = permutation[col];
// then undo the rotation: old = (new + (offset+8)) mod 64
col = (col + (offset + 8)) & 63;
pairs[i] = (row << 6) | col;
}
final PairTable table = PairTable.newInstanceFromPairsArray(pairs, numPairs, srcLgK);
target.pairTable = table;
}
}
static CompressedState compress(final CpcSketch source, final CompressedState target) {
final Flavor srcFlavor = source.getFlavor();
switch (srcFlavor) {
case EMPTY: break;
case SPARSE:
compressSparseFlavor(target, source);
assert (target.cwStream == null);
assert (target.csvStream != null);
break;
case HYBRID:
compressHybridFlavor(target, source);
assert (target.cwStream == null);
assert (target.csvStream != null);
break;
case PINNED:
compressPinnedFlavor(target, source);
assert (target.cwStream != null);
break;
case SLIDING:
compressSlidingFlavor(target, source);
assert (target.cwStream != null);
break;
//default: not possible
}
return target;
}
static CpcSketch uncompress(final CompressedState source, final CpcSketch target) {
assert (target != null);
final Flavor srcFlavor = source.getFlavor();
switch (srcFlavor) {
case EMPTY: break;
case SPARSE:
assert (source.cwStream == null);
uncompressSparseFlavor(target, source);
break;
case HYBRID:
uncompressHybridFlavor(target, source);
break;
case PINNED:
assert (source.cwStream != null);
uncompressPinnedFlavor(target, source);
break;
case SLIDING:
uncompressSlidingFlavor(target, source);
break;
//default: not possible
}
return target;
}
private static int golombChooseNumberOfBaseBits(final int k, final long count) {
assert k >= 1L;
assert count >= 1L;
final long quotient = (k - count) / count; // integer division
if (quotient == 0) { return 0; }
return (int) floorLog2ofLong(quotient);
}
private static long floorLog2ofLong(final long x) { //not a good name
assert (x >= 1L);
long p = 0;
long y = 1;
while (true) {
if (y == x) { return p; }
if (y > x) { return p - 1; }
p += 1;
y <<= 1;
}
}
private static long divideBy32RoundingUp(final long x) {
final long tmp = x >>> 5;
return ((tmp << 5) == x) ? tmp : tmp + 1;
}
}