| /* |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| */ |
| |
| package org.apache.datasketches.cpc; |
| |
| import static org.apache.datasketches.cpc.CompressionData.columnPermutationsForDecoding; |
| import static org.apache.datasketches.cpc.CompressionData.columnPermutationsForEncoding; |
| import static org.apache.datasketches.cpc.CompressionData.decodingTablesForHighEntropyByte; |
| import static org.apache.datasketches.cpc.CompressionData.encodingTablesForHighEntropyByte; |
| import static org.apache.datasketches.cpc.CompressionData.lengthLimitedUnaryDecodingTable65; |
| import static org.apache.datasketches.cpc.CompressionData.lengthLimitedUnaryEncodingTable65; |
| import static org.apache.datasketches.cpc.PairTable.introspectiveInsertionSort; |
| //import static org.apache.datasketches.cpc.RuntimeAsserts.rtAssertEquals; |
| |
| /** |
| * @author Lee Rhodes |
| * @author Kevin Lang |
| */ |
| final class CpcCompression { |
| //visible for test |
| static final int NEXT_WORD_IDX = 0; //ptrArr[NEXT_WORD_IDX] |
| static final int BIT_BUF = 1; //ptrArr[BIT_BUF] |
| static final int BUF_BITS = 2; //ptrArr[BUF_BITS] |
| |
| //visible for test |
| static void writeUnary( |
| final int[] compressedWords, |
| final long[] ptrArr, |
| final int theValue) { |
| |
| int nextWordIndex = (int) ptrArr[NEXT_WORD_IDX]; //must be int |
| assert (nextWordIndex == ptrArr[NEXT_WORD_IDX]); //catch truncation error |
| long bitBuf = ptrArr[BIT_BUF]; //must be long |
| int bufBits = (int) ptrArr[BUF_BITS]; //could be byte |
| |
| assert (compressedWords != null); |
| assert (nextWordIndex >= 0); |
| assert (bitBuf >= 0); |
| assert ((bufBits >= 0) && (bufBits <= 31)); |
| |
| int remaining = theValue; |
| |
| while (remaining >= 16) { |
| remaining -= 16; |
| // Here we output 16 zeros, but we don't need to physically write them into bitbuf |
| // because it already contains zeros in that region. |
| bufBits += 16; // Record the fact that 16 bits of output have occurred. |
| //MAYBE_FLUSH_BITBUF(compressedWords, nextWordIndex); |
| if (bufBits >= 32) { |
| compressedWords[nextWordIndex++] = (int) bitBuf; |
| bitBuf >>>= 32; |
| bufBits -= 32; |
| } |
| } |
| |
| assert (remaining >= 0) && (remaining <= 15); |
| |
| final long theUnaryCode = 1L << remaining; //must be a long |
| bitBuf |= theUnaryCode << bufBits; |
| bufBits += (1 + remaining); |
| //MAYBE_FLUSH_BITBUF(compressedWords, nextWordIndex); |
| if (bufBits >= 32) { |
| compressedWords[nextWordIndex++] = (int) bitBuf; |
| bitBuf >>>= 32; |
| bufBits -= 32; |
| } |
| |
| ptrArr[NEXT_WORD_IDX] = nextWordIndex; |
| assert nextWordIndex == ptrArr[NEXT_WORD_IDX]; //catch sign extension error |
| ptrArr[BIT_BUF] = bitBuf; |
| ptrArr[BUF_BITS] = bufBits; |
| } |
| |
| //visible for test |
| static long readUnary( |
| final int[] compressedWords, |
| final long[] ptrArr) { |
| |
| int nextWordIndex = (int) ptrArr[NEXT_WORD_IDX]; //must be int |
| assert nextWordIndex == ptrArr[NEXT_WORD_IDX]; //catch truncation error |
| long bitBuf = ptrArr[BIT_BUF]; |
| int bufBits = (int) ptrArr[BUF_BITS]; |
| |
| assert compressedWords != null; |
| assert nextWordIndex >= 0; |
| assert bitBuf >= 0; |
| assert bufBits >= 0; |
| |
| long subTotal = 0; |
| int trailingZeros; |
| |
| //readUnaryLoop: |
| while (true) { |
| //MAYBE_FILL_BITBUF(compressedWords,nextWordIndex,8); // ensure 8 bits in bit buffer |
| if (bufBits < 8) { // Prepare for an 8-bit peek into the bitstream. |
| bitBuf |= ((compressedWords[nextWordIndex++] & 0XFFFF_FFFFL) << bufBits); |
| bufBits += 32; |
| } |
| |
| // These 8 bits include either all or part of the Unary codeword. |
| final int peek8 = (int) (bitBuf & 0XFFL); |
| trailingZeros = Math.min(8, Integer.numberOfTrailingZeros(peek8)); |
| |
| assert ((trailingZeros >= 0) && (trailingZeros <= 8)) : "TZ+ " + trailingZeros; |
| |
| if (trailingZeros == 8) { // The codeword was partial, so read some more. |
| subTotal += 8; |
| bufBits -= 8; |
| bitBuf >>>= 8; |
| continue; |
| } |
| break; |
| } |
| |
| bufBits -= (1 + trailingZeros); |
| bitBuf >>>= (1 + trailingZeros); |
| |
| ptrArr[NEXT_WORD_IDX] = nextWordIndex; |
| assert nextWordIndex == ptrArr[NEXT_WORD_IDX]; //catch sign extension error |
| ptrArr[BIT_BUF] = bitBuf; |
| ptrArr[BUF_BITS] = bufBits; |
| return subTotal + trailingZeros; |
| } |
| |
| /** |
| * This returns the number of compressedWords that were actually used. |
| * @param byteArray input |
| * @param numBytesToEncode input |
| * @param encodingTable input |
| * @param compressedWords output |
| * @return the number of compressedWords that were actually used. |
| */ |
| //visible for test |
| //It is the caller's responsibility to ensure that the compressedWords array is long enough. |
| static int lowLevelCompressBytes( |
| final byte[] byteArray, // input |
| final int numBytesToEncode, // input, must be an int |
| final short[] encodingTable, // input |
| final int[] compressedWords) { // output |
| |
| int nextWordIndex = 0; |
| long bitBuf = 0; // bits are packed into this first, then are flushed to compressedWords |
| int bufBits = 0; // number of bits currently in bitbuf; must be between 0 and 31 |
| |
| for (int byteIndex = 0; byteIndex < numBytesToEncode; byteIndex++) { |
| final int theByte = byteArray[byteIndex] & 0XFF; |
| final long codeInfo = (encodingTable[theByte] & 0XFFFFL); |
| final long codeVal = codeInfo & 0XFFFL; |
| final int codeWordLength = (int) (codeInfo >>> 12); |
| bitBuf |= (codeVal << bufBits); |
| bufBits += codeWordLength; |
| //MAYBE_FLUSH_BITBUF(compressedWords, nextWordIndex); |
| if (bufBits >= 32) { |
| compressedWords[nextWordIndex++] = (int) bitBuf; |
| bitBuf >>>= 32; |
| bufBits -= 32; |
| } |
| } |
| |
| //Pad the bitstream with 11 zero-bits so that the decompressor's 12-bit peek |
| // can't overrun its input. |
| bufBits += 11; |
| //MAYBE_FLUSH_BITBUF(compressedWords, nextWordIndex); |
| if (bufBits >= 32) { |
| compressedWords[nextWordIndex++] = (int) bitBuf; |
| bitBuf >>>= 32; |
| bufBits -= 32; |
| } |
| |
| if (bufBits > 0) { // We are done encoding now, so we flush the bit buffer. |
| assert (bufBits < 32); |
| compressedWords[nextWordIndex++] = (int) bitBuf; |
| } |
| return nextWordIndex; |
| } |
| |
| //visible for test |
| static void lowLevelUncompressBytes( |
| final byte[] byteArray, // output |
| final int numBytesToDecode, // input (but refers to the output) |
| final short[] decodingTable, // input |
| final int[] compressedWords, // input |
| final long numCompressedWords) { // input |
| |
| int byteIndex = 0; |
| int nextWordIndex = 0; |
| long bitBuf = 0; |
| int bufBits = 0; |
| |
| assert (byteArray != null); |
| assert (decodingTable != null); |
| assert (compressedWords != null); |
| |
| for (byteIndex = 0; byteIndex < numBytesToDecode; byteIndex++) { |
| //MAYBE_FILL_BITBUF(compressedWords,wordIndex,12); // ensure 12 bits in bit buffer |
| if (bufBits < 12) { // Prepare for a 12-bit peek into the bitstream. |
| bitBuf |= ((compressedWords[nextWordIndex++] & 0XFFFF_FFFFL) << bufBits); |
| bufBits += 32; |
| } |
| |
| // These 12 bits will include an entire Huffman codeword. |
| final int peek12 = (int) (bitBuf & 0XFFFL); |
| final int lookup = decodingTable[peek12] & 0XFFFF; |
| final int codeWordLength = lookup >>> 8; |
| final byte decodedByte = (byte) (lookup & 0XFF); |
| byteArray[byteIndex] = decodedByte; |
| bitBuf >>>= codeWordLength; |
| bufBits -= codeWordLength; |
| } |
| |
| // Buffer over-run should be impossible unless there is a bug. |
| // However, we might as well check here. |
| assert (nextWordIndex <= numCompressedWords); |
| } |
| |
| /** |
| * Here "pairs" refers to row/column pairs that specify the positions of surprising values in |
| * the bit matrix. |
| * @param pairArray input |
| * @param numPairsToEncode input |
| * @param numBaseBits input |
| * @param compressedWords output |
| * @return the number of compressedWords actually used |
| */ |
| //visible for test |
| static long lowLevelCompressPairs( |
| final int[] pairArray, // input |
| final int numPairsToEncode, // input |
| final int numBaseBits, // input //cannot exceed 63 or 6 bits, could be byte |
| final int[] compressedWords) { // output |
| |
| int pairIndex = 0; |
| |
| final long[] ptrArr = new long[3]; |
| int nextWordIndex = 0; //must be int |
| long bitBuf = 0; //must be long |
| int bufBits = 0; //could be byte |
| |
| final long golombLoMask = (1L << numBaseBits) - 1L; |
| |
| int predictedRowIndex = 0; |
| int predictedColIndex = 0; |
| |
| for (pairIndex = 0; pairIndex < numPairsToEncode; pairIndex++) { |
| final int rowCol = pairArray[pairIndex]; |
| final int rowIndex = rowCol >>> 6; |
| final int colIndex = rowCol & 0X3F; //63 |
| |
| if (rowIndex != predictedRowIndex) { predictedColIndex = 0; } |
| |
| assert (rowIndex >= predictedRowIndex); |
| assert (colIndex >= predictedColIndex); |
| |
| final long yDelta = rowIndex - predictedRowIndex; //cannot exceed 2^26 |
| final int xDelta = colIndex - predictedColIndex; //cannot exceed 65 |
| |
| predictedRowIndex = rowIndex; |
| predictedColIndex = colIndex + 1; |
| |
| final long codeInfo = lengthLimitedUnaryEncodingTable65[xDelta] & 0XFFFFL; |
| final long codeVal = codeInfo & 0XFFFL; |
| final int codeLen = (int) (codeInfo >>> 12); |
| bitBuf |= (codeVal << bufBits); |
| bufBits += codeLen; |
| //MAYBE_FLUSH_BITBUF(compressedWords, nextWordIndex); |
| if (bufBits >= 32) { |
| compressedWords[nextWordIndex++] = (int) bitBuf; |
| bitBuf >>>= 32; |
| bufBits -= 32; |
| } |
| |
| final long golombLo = yDelta & golombLoMask; //long for bitBuf |
| final long golombHi = yDelta >>> numBaseBits; //cannot exceed 2^26 |
| |
| //TODO Inline WriteUnary |
| ptrArr[NEXT_WORD_IDX] = nextWordIndex; |
| ptrArr[BIT_BUF] = bitBuf; |
| ptrArr[BUF_BITS] = bufBits; |
| assert nextWordIndex == ptrArr[NEXT_WORD_IDX]; //catch sign extension error |
| |
| writeUnary(compressedWords, ptrArr, (int) golombHi); |
| |
| nextWordIndex = (int) ptrArr[NEXT_WORD_IDX]; |
| bitBuf = ptrArr[BIT_BUF]; |
| bufBits = (int) ptrArr[BUF_BITS]; |
| assert nextWordIndex == ptrArr[NEXT_WORD_IDX]; //catch truncation error |
| //END Inline WriteUnary |
| |
| bitBuf |= golombLo << bufBits; |
| bufBits += numBaseBits; |
| //MAYBE_FLUSH_BITBUF(compressedWords, nextWordIndex); |
| if (bufBits >= 32) { |
| compressedWords[nextWordIndex++] = (int) bitBuf; |
| bitBuf >>>= 32; |
| bufBits -= 32; |
| } |
| } |
| |
| // Pad the bitstream so that the decompressor's 12-bit peek can't overrun its input. |
| int padding = 10 - numBaseBits; |
| if (padding < 0) { padding = 0; } |
| bufBits += padding; |
| //MAYBE_FLUSH_BITBUF(compressedWords, nextWordIndex); |
| if (bufBits >= 32) { |
| compressedWords[nextWordIndex++] = (int) bitBuf; |
| bitBuf >>>= 32; |
| bufBits -= 32; |
| } |
| |
| if (bufBits > 0) { // We are done encoding now, so we flush the bit buffer. |
| assert (bufBits < 32); |
| compressedWords[nextWordIndex++] = (int) bitBuf; |
| //bitBuf = 0; |
| //bufBits = 0; // not really necessary |
| } |
| return nextWordIndex; |
| } |
| |
| //visible for test |
| static void lowLevelUncompressPairs( |
| final int[] pairArray, // output |
| final int numPairsToDecode, // input, size of output, must be int |
| final int numBaseBits, // input, cannot exceed 6 bits |
| final int[] compressedWords, // input |
| final long numCompressedWords) { // input |
| |
| int pairIndex = 0; |
| |
| final long[] ptrArr = new long[3]; |
| int nextWordIndex = 0; |
| long bitBuf = 0; |
| int bufBits = 0; |
| |
| final long golombLoMask = (1L << numBaseBits) - 1L; |
| |
| int predictedRowIndex = 0; |
| int predictedColIndex = 0; |
| |
| // for each pair we need to read: |
| // xDelta (12-bit length-limited unary) |
| // yDeltaHi (unary) |
| // yDeltaLo (basebits) |
| |
| for (pairIndex = 0; pairIndex < numPairsToDecode; pairIndex++) { |
| |
| //MAYBE_FILL_BITBUF(compressedWords,wordIndex,12); // ensure 12 bits in bit buffer |
| if (bufBits < 12) { // Prepare for a 12-bit peek into the bitstream. |
| bitBuf |= ((compressedWords[nextWordIndex++] & 0XFFFF_FFFFL) << bufBits); |
| bufBits += 32; |
| } |
| |
| final int peek12 = (int) (bitBuf & 0XFFFL); |
| final int lookup = lengthLimitedUnaryDecodingTable65[peek12] & 0XFFFF; |
| final int codeWordLength = lookup >>> 8; |
| final int xDelta = lookup & 0XFF; |
| bitBuf >>>= codeWordLength; |
| bufBits -= codeWordLength; |
| |
| //TODO Inline ReadUnary |
| ptrArr[NEXT_WORD_IDX] = nextWordIndex; |
| ptrArr[BIT_BUF] = bitBuf; |
| ptrArr[BUF_BITS] = bufBits; |
| assert nextWordIndex == ptrArr[NEXT_WORD_IDX]; //catch sign extension error |
| |
| final long golombHi = readUnary(compressedWords, ptrArr); |
| |
| nextWordIndex = (int) ptrArr[NEXT_WORD_IDX]; |
| bitBuf = ptrArr[BIT_BUF]; |
| bufBits = (int) ptrArr[BUF_BITS]; |
| assert nextWordIndex == ptrArr[NEXT_WORD_IDX]; //catch truncation error |
| //END Inline ReadUnary |
| |
| //MAYBE_FILL_BITBUF(compressedWords,wordIndex,numBaseBits); // ensure numBaseBits in bit buffer |
| if (bufBits < numBaseBits) { // Prepare for a numBaseBits peek into the bitstream. |
| bitBuf |= ((compressedWords[nextWordIndex++] & 0XFFFF_FFFFL) << bufBits); |
| bufBits += 32; |
| } |
| |
| final long golombLo = bitBuf & golombLoMask; |
| bitBuf >>>= numBaseBits; |
| bufBits -= numBaseBits; |
| final long yDelta = (golombHi << numBaseBits) | golombLo; |
| |
| // Now that we have yDelta and xDelta, we can compute the pair's row and column. |
| if (yDelta > 0) { predictedColIndex = 0; } |
| final int rowIndex = predictedRowIndex + (int) yDelta; |
| final int colIndex = predictedColIndex + xDelta; |
| final int rowCol = (rowIndex << 6) | colIndex; |
| pairArray[pairIndex] = rowCol; |
| predictedRowIndex = rowIndex; |
| predictedColIndex = colIndex + 1; |
| } |
| // check for buffer over-run |
| assert (nextWordIndex <= numCompressedWords) |
| : "nextWdIdx: " + nextWordIndex + ", #CompWds: " + numCompressedWords; |
| } |
| |
| private static int safeLengthForCompressedPairBuf( |
| final long k, final long numPairs, final long numBaseBits) { |
| assert (numPairs > 0); |
| // long ybits = k + numPairs; // simpler and safer UB |
| // The following tighter UB on ybits is based on page 198 |
| // of the textbook "Managing Gigabytes" by Witten, Moffat, and Bell. |
| // Notice that if numBaseBits == 0 it coincides with (k + numPairs). |
| final long ybits = (numPairs * (1L + numBaseBits)) + (k >>> numBaseBits); |
| final long xbits = 12 * numPairs; |
| long padding = 10L - numBaseBits; |
| if (padding < 0) { padding = 0; } |
| final long bits = xbits + ybits + padding; |
| //final long words = divideLongsRoundingUp(bits, 32); |
| final long words = CpcCompression.divideBy32RoundingUp(bits); |
| assert words < (1L << 31); |
| return (int) words; |
| } |
| |
| // Explanation of padding: we write |
| // 1) xdelta (huffman, provides at least 1 bit, requires 12-bit lookahead) |
| // 2) ydeltaGolombHi (unary, provides at least 1 bit, requires 8-bit lookahead) |
| // 3) ydeltaGolombLo (straight B bits). |
| // So the 12-bit lookahead is the tight constraint, but there are at least (2 + B) bits emitted, |
| // so we would be safe with max (0, 10 - B) bits of padding at the end of the bitstream. |
| private static int safeLengthForCompressedWindowBuf(final long k) { // measured in 32-bit words |
| // 11 bits of padding, due to 12-bit lookahead, with 1 bit certainly present. |
| final long bits = (12 * k) + 11; |
| //cannot exceed Integer.MAX_VALUE |
| //return (int) (divideLongsRoundingUp(bits, 32)); |
| return (int) CpcCompression.divideBy32RoundingUp(bits); |
| } |
| |
| private static int determinePseudoPhase(final int lgK, final long numCoupons) { |
| final long k = 1L << lgK; |
| final long c = numCoupons; |
| // This midrange logic produces pseudo-phases. They are used to select encoding tables. |
| // The thresholds were chosen by hand after looking at plots of measured compression. |
| if ((1000 * c) < (2375 * k)) { |
| if ( (4 * c) < (3 * k)) { return ( 16 + 0 ); } // midrange table |
| else if ( (10 * c) < (11 * k)) { return ( 16 + 1 ); } // midrange table |
| else if ( (100 * c) < (132 * k)) { return ( 16 + 2 ); } // midrange table |
| else if ( (3 * c) < (5 * k)) { return ( 16 + 3 ); } // midrange table |
| else if ((1000 * c) < (1965 * k)) { return ( 16 + 4 ); } // midrange table |
| else if ((1000 * c) < (2275 * k)) { return ( 16 + 5 ); } // midrange table |
| else { return 6; } // steady-state table employed before its actual phase |
| } |
| else { |
| // This steady-state logic produces true phases. They are used to select |
| // encoding tables, and also column permutations for the "Sliding" flavor. |
| assert lgK >= 4; |
| final long tmp = c >>> (lgK - 4); |
| final int phase = (int) (tmp & 15L); |
| assert (phase >= 0) && (phase < 16); |
| return phase; |
| } |
| } |
| |
| private static void compressTheWindow(final CompressedState target, final CpcSketch source) { |
| final int srcLgK = source.lgK; |
| final int srcK = 1 << srcLgK; |
| final int windowBufLen = safeLengthForCompressedWindowBuf(srcK); |
| final int[] windowBuf = new int[windowBufLen]; |
| final int pseudoPhase = determinePseudoPhase(srcLgK, source.numCoupons); |
| target.cwLengthInts = lowLevelCompressBytes( |
| source.slidingWindow, |
| srcK, |
| encodingTablesForHighEntropyByte[pseudoPhase], |
| windowBuf); |
| |
| // At this point we free the unused portion of the compression output buffer. |
| // final int[] shorterBuf = Arrays.copyOf(windowBuf, target.cwLength); |
| // target.compressedWindow = shorterBuf; |
| target.cwStream = windowBuf; //avoid extra copy |
| } |
| |
| private static void uncompressTheWindow(final CpcSketch target, final CompressedState source) { |
| final int srcLgK = source.lgK; |
| final int srcK = 1 << srcLgK; |
| final byte[] window = new byte[srcK]; |
| // bzero ((void *) window, (size_t) k); // zeroing not needed here (unlike the Hybrid Flavor) |
| assert (target.slidingWindow == null); |
| target.slidingWindow = window; |
| final int pseudoPhase = determinePseudoPhase(srcLgK, source.numCoupons); |
| assert (source.cwStream != null); |
| lowLevelUncompressBytes(target.slidingWindow, srcK, |
| decodingTablesForHighEntropyByte[pseudoPhase], |
| source.cwStream, |
| source.cwLengthInts); |
| } |
| |
| private static void compressTheSurprisingValues(final CompressedState target, final CpcSketch source, |
| final int[] pairs, final int numPairs) { |
| assert (numPairs > 0); |
| target.numCsv = numPairs; |
| final int srcK = 1 << source.lgK; |
| final int numBaseBits = CpcCompression.golombChooseNumberOfBaseBits(srcK + numPairs, numPairs); |
| final int pairBufLen = safeLengthForCompressedPairBuf(srcK, numPairs, numBaseBits); |
| final int[] pairBuf = new int[pairBufLen]; |
| |
| target.csvLengthInts = (int) lowLevelCompressPairs(pairs, numPairs, numBaseBits, pairBuf); |
| |
| // At this point we free the unused portion of the compression output buffer. |
| // final int[] shorterBuf = Arrays.copyOf(pairBuf, target.csvLength); |
| // target.compressedWindow = shorterBuf; |
| |
| target.csvStream = pairBuf; //avoid extra copy |
| } |
| |
| //allocates and returns an array of uncompressed pairs. |
| //the length of this array is known to the source sketch. |
| private static int[] uncompressTheSurprisingValues(final CompressedState source) { |
| final int srcK = 1 << source.lgK; |
| final int numPairs = source.numCsv; |
| assert numPairs > 0; |
| final int[] pairs = new int[numPairs]; |
| final int numBaseBits = CpcCompression.golombChooseNumberOfBaseBits(srcK + numPairs, numPairs); |
| lowLevelUncompressPairs(pairs, numPairs, numBaseBits, source.csvStream, source.csvLengthInts); |
| return pairs; |
| } |
| |
| private static void compressSparseFlavor(final CompressedState target, final CpcSketch source) { |
| assert (source.slidingWindow == null); //there is no window to compress |
| final PairTable srcPairTable = source.pairTable; |
| final int srcNumPairs = srcPairTable.getNumPairs(); |
| final int[] srcPairArr = PairTable.unwrappingGetItems(srcPairTable, srcNumPairs); |
| introspectiveInsertionSort(srcPairArr, 0, srcNumPairs - 1); |
| compressTheSurprisingValues(target, source, srcPairArr, srcNumPairs); |
| } |
| |
| private static void uncompressSparseFlavor(final CpcSketch target, final CompressedState source) { |
| assert (source.cwStream == null); |
| assert (source.csvStream != null); |
| final int[] srcPairArr = uncompressTheSurprisingValues(source); |
| final int numPairs = source.numCsv; |
| final PairTable table = PairTable.newInstanceFromPairsArray(srcPairArr, numPairs, source.lgK); |
| target.pairTable = table; |
| } |
| |
| //The empty space that this leaves at the beginning of the output array |
| // will be filled in later by the caller. |
| private static int[] trickyGetPairsFromWindow(final byte[] window, final int k, final int numPairsToGet, |
| final int emptySpace) { |
| final int outputLength = emptySpace + numPairsToGet; |
| final int[] pairs = new int[outputLength]; |
| int rowIndex = 0; |
| int pairIndex = emptySpace; |
| for (rowIndex = 0; rowIndex < k; rowIndex++) { |
| int wByte = window[rowIndex] & 0XFF; |
| while (wByte != 0) { |
| final int colIndex = Integer.numberOfTrailingZeros(wByte); |
| // assert (colIndex < 8); |
| wByte ^= (1 << colIndex); // erase the 1 |
| pairs[pairIndex++] = (rowIndex << 6) | colIndex; |
| } |
| } |
| assert (pairIndex == outputLength); |
| return (pairs); |
| } |
| |
| //This is complicated because it effectively builds a Sparse version |
| //of a Pinned sketch before compressing it. Hence the name Hybrid. |
| private static void compressHybridFlavor(final CompressedState target, final CpcSketch source) { |
| final int srcK = 1 << source.lgK; |
| final PairTable srcPairTable = source.pairTable; |
| final int srcNumPairs = srcPairTable.getNumPairs(); |
| final int[] srcPairArr = PairTable.unwrappingGetItems(srcPairTable, srcNumPairs); |
| introspectiveInsertionSort(srcPairArr, 0, srcNumPairs - 1); |
| final byte[] srcSlidingWindow = source.slidingWindow; |
| final int srcWindowOffset = source.windowOffset; |
| final long srcNumCoupons = source.numCoupons; |
| assert (srcSlidingWindow != null); |
| assert (srcWindowOffset == 0); |
| final long numPairs = srcNumCoupons - srcNumPairs; // because the window offset is zero |
| assert numPairs < Integer.MAX_VALUE; |
| final int numPairsFromArray = (int) numPairs; |
| |
| assert (numPairsFromArray + srcNumPairs) == srcNumCoupons; //for test |
| final int[] allPairs |
| = trickyGetPairsFromWindow(srcSlidingWindow, srcK, numPairsFromArray, srcNumPairs); |
| |
| PairTable.merge(srcPairArr, 0, srcNumPairs, |
| allPairs, srcNumPairs, numPairsFromArray, |
| allPairs, 0); // note the overlapping subarray trick |
| |
| //FOR TESTING If needed |
| // for (int i = 0; i < (source.numCoupons - 1); i++) { |
| // assert (Integer.compareUnsigned(allPairs[i], allPairs[i + 1]) < 0); } |
| |
| compressTheSurprisingValues(target, source, allPairs, (int) srcNumCoupons); |
| } |
| |
| private static void uncompressHybridFlavor(final CpcSketch target, final CompressedState source) { |
| assert (source.cwStream == null); |
| assert (source.csvStream != null); |
| final int[] pairs = uncompressTheSurprisingValues(source); //fail path 3 |
| final int numPairs = source.numCsv; |
| // In the hybrid flavor, some of these pairs actually |
| // belong in the window, so we will separate them out, |
| // moving the "true" pairs to the bottom of the array. |
| final int srcLgK = source.lgK; |
| final int k = 1 << srcLgK; |
| |
| final byte[] window = new byte[k]; |
| |
| int nextTruePair = 0; |
| |
| for (int i = 0; i < numPairs; i++) { |
| final int rowCol = pairs[i]; |
| assert (rowCol != -1); |
| final int col = rowCol & 63; |
| if (col < 8) { |
| final int row = rowCol >>> 6; |
| window[row] |= (1 << col); // set the window bit |
| } |
| else { |
| pairs[nextTruePair++] = rowCol; // move true pair down |
| } |
| } |
| |
| assert (source.getWindowOffset() == 0); |
| target.windowOffset = 0; |
| |
| final PairTable table = PairTable.newInstanceFromPairsArray(pairs, nextTruePair, srcLgK); |
| target.pairTable = table; |
| target.slidingWindow = window; |
| } |
| |
| private static void compressPinnedFlavor(final CompressedState target, final CpcSketch source) { |
| compressTheWindow(target, source); |
| final PairTable srcPairTable = source.pairTable; |
| final int numPairs = srcPairTable.getNumPairs(); |
| |
| if (numPairs > 0) { |
| final int[] pairs = PairTable.unwrappingGetItems(srcPairTable, numPairs); |
| |
| // Here we subtract 8 from the column indices. Because they are stored in the low 6 bits |
| // of each rowCol pair, and because no column index is less than 8 for a "Pinned" sketch, |
| // I believe we can simply subtract 8 from the pairs themselves. |
| |
| // shift the columns over by 8 positions before compressing (because of the window) |
| for (int i = 0; i < numPairs; i++) { |
| assert (pairs[i] & 63) >= 8; |
| pairs[i] -= 8; |
| } |
| |
| introspectiveInsertionSort(pairs, 0, numPairs - 1); |
| compressTheSurprisingValues(target, source, pairs, numPairs); |
| } |
| } |
| |
| private static void uncompressPinnedFlavor(final CpcSketch target, final CompressedState source) { |
| assert (source.cwStream != null); |
| uncompressTheWindow(target, source); |
| final int srcLgK = source.lgK; |
| final int numPairs = source.numCsv; |
| if (numPairs == 0) { |
| target.pairTable = new PairTable(2, 6 + srcLgK); |
| } |
| else { |
| assert numPairs > 0; |
| assert source.csvStream != null; |
| final int[] pairs = uncompressTheSurprisingValues(source); |
| // undo the compressor's 8-column shift |
| for (int i = 0; i < numPairs; i++) { |
| assert (pairs[i] & 63) < 56; |
| pairs[i] += 8; |
| } |
| final PairTable table = PairTable.newInstanceFromPairsArray(pairs, numPairs, srcLgK); |
| target.pairTable = table; |
| } |
| } |
| |
| //Complicated by the existence of both a left fringe and a right fringe. |
| private static void compressSlidingFlavor(final CompressedState target, final CpcSketch source) { |
| |
| compressTheWindow(target, source); |
| final PairTable srcPairTable = source.pairTable; |
| |
| final int numPairs = srcPairTable.getNumPairs(); |
| |
| if (numPairs > 0) { |
| final int[] pairs = PairTable.unwrappingGetItems(srcPairTable, numPairs); |
| |
| // Here we apply a complicated transformation to the column indices, which |
| // changes the implied ordering of the pairs, so we must do it before sorting. |
| |
| final int pseudoPhase = determinePseudoPhase(source.lgK, source.numCoupons); // NB |
| assert (pseudoPhase < 16); |
| final byte[] permutation = columnPermutationsForEncoding[pseudoPhase]; |
| |
| final int offset = source.windowOffset; |
| assert ((offset > 0) && (offset <= 56)); |
| |
| for (int i = 0; i < numPairs; i++) { |
| final int rowCol = pairs[i]; |
| final int row = rowCol >>> 6; |
| int col = (rowCol & 63); |
| // first rotate the columns into a canonical configuration: |
| // new = ((old - (offset+8)) + 64) mod 64 |
| col = ((col + 56) - offset) & 63; |
| assert (col >= 0) && (col < 56); |
| // then apply the permutation |
| col = permutation[col]; |
| pairs[i] = (row << 6) | col; |
| } |
| |
| introspectiveInsertionSort(pairs, 0, numPairs - 1); |
| compressTheSurprisingValues(target, source, pairs, numPairs); |
| } |
| } |
| |
| private static void uncompressSlidingFlavor(final CpcSketch target, final CompressedState source) { |
| assert (source.cwStream != null); |
| uncompressTheWindow(target, source); |
| final int srcLgK = source.lgK; |
| final int numPairs = source.numCsv; |
| if (numPairs == 0) { |
| target.pairTable = new PairTable(2, 6 + srcLgK); |
| } |
| else { |
| assert (numPairs > 0); |
| assert (source.csvStream != null); |
| final int[] pairs = uncompressTheSurprisingValues(source); |
| final int pseudoPhase = determinePseudoPhase(srcLgK, source.numCoupons); // NB |
| assert (pseudoPhase < 16); |
| final byte[] permutation = columnPermutationsForDecoding[pseudoPhase]; |
| |
| final int offset = source.getWindowOffset(); |
| assert (offset > 0) && (offset <= 56); |
| |
| for (int i = 0; i < numPairs; i++) { |
| final int rowCol = pairs[i]; |
| final int row = rowCol >>> 6; |
| int col = rowCol & 63; |
| // first undo the permutation |
| col = permutation[col]; |
| // then undo the rotation: old = (new + (offset+8)) mod 64 |
| col = (col + (offset + 8)) & 63; |
| pairs[i] = (row << 6) | col; |
| } |
| |
| final PairTable table = PairTable.newInstanceFromPairsArray(pairs, numPairs, srcLgK); |
| target.pairTable = table; |
| } |
| } |
| |
| static CompressedState compress(final CpcSketch source, final CompressedState target) { |
| |
| final Flavor srcFlavor = source.getFlavor(); |
| |
| switch (srcFlavor) { |
| case EMPTY: break; |
| case SPARSE: |
| compressSparseFlavor(target, source); |
| assert (target.cwStream == null); |
| assert (target.csvStream != null); |
| break; |
| case HYBRID: |
| compressHybridFlavor(target, source); |
| assert (target.cwStream == null); |
| assert (target.csvStream != null); |
| break; |
| case PINNED: |
| compressPinnedFlavor(target, source); |
| assert (target.cwStream != null); |
| break; |
| case SLIDING: |
| compressSlidingFlavor(target, source); |
| assert (target.cwStream != null); |
| break; |
| //default: not possible |
| } |
| |
| return target; |
| } |
| |
| static CpcSketch uncompress(final CompressedState source, final CpcSketch target) { |
| assert (target != null); |
| |
| final Flavor srcFlavor = source.getFlavor(); |
| switch (srcFlavor) { |
| case EMPTY: break; |
| case SPARSE: |
| assert (source.cwStream == null); |
| uncompressSparseFlavor(target, source); |
| break; |
| case HYBRID: |
| uncompressHybridFlavor(target, source); |
| break; |
| case PINNED: |
| assert (source.cwStream != null); |
| uncompressPinnedFlavor(target, source); |
| break; |
| case SLIDING: |
| uncompressSlidingFlavor(target, source); |
| break; |
| //default: not possible |
| } |
| return target; |
| } |
| |
| private static int golombChooseNumberOfBaseBits(final int k, final long count) { |
| assert k >= 1L; |
| assert count >= 1L; |
| final long quotient = (k - count) / count; // integer division |
| if (quotient == 0) { return 0; } |
| return (int) floorLog2ofLong(quotient); |
| } |
| |
| private static long floorLog2ofLong(final long x) { //not a good name |
| assert (x >= 1L); |
| long p = 0; |
| long y = 1; |
| while (true) { |
| if (y == x) { return p; } |
| if (y > x) { return p - 1; } |
| p += 1; |
| y <<= 1; |
| } |
| } |
| |
| |
| private static long divideBy32RoundingUp(final long x) { |
| final long tmp = x >>> 5; |
| return ((tmp << 5) == x) ? tmp : tmp + 1; |
| } |
| |
| } |