| /* |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| */ |
| |
| package org.apache.datasketches.cpc; |
| |
| import static java.lang.Math.log; |
| import static java.lang.Math.sqrt; |
| import static java.nio.charset.StandardCharsets.UTF_8; |
| import static org.apache.datasketches.Util.DEFAULT_UPDATE_SEED; |
| import static org.apache.datasketches.Util.checkSeedHashes; |
| import static org.apache.datasketches.Util.computeSeedHash; |
| import static org.apache.datasketches.Util.invPow2; |
| import static org.apache.datasketches.Util.zeroPad; |
| import static org.apache.datasketches.cpc.CpcUtil.bitMatrixOfSketch; |
| import static org.apache.datasketches.cpc.CpcUtil.checkLgK; |
| import static org.apache.datasketches.cpc.CpcUtil.countBitsSetInMatrix; |
| import static org.apache.datasketches.hash.MurmurHash3.hash; |
| |
| import java.util.Arrays; |
| |
| import org.apache.datasketches.Family; |
| import org.apache.datasketches.memory.Memory; |
| import org.apache.datasketches.memory.WritableMemory; |
| |
| /** |
| * This is a unique-counting sketch that implements the |
| * <i>Compressed Probabilistic Counting (CPC, a.k.a FM85)</i> algorithms developed by Kevin Lang in |
| * his paper |
| * <a href="https://arxiv.org/abs/1708.06839">Back to the Future: an Even More Nearly |
| * Optimal Cardinality Estimation Algorithm</a>. |
| * |
| * <p>This sketch is extremely space-efficient when serialized. In an apples-to-apples empirical |
| * comparison against compressed HyperLogLog sketches, this new algorithm simultaneously wins on |
| * the two dimensions of the space/accuracy tradeoff and produces sketches that are |
| * smaller than the entropy of HLL, so no possible implementation of compressed HLL can match its |
| * space efficiency for a given accuracy. As described in the paper this sketch implements a newly |
| * developed ICON estimator algorithm that survives unioning operations, another |
| * well-known estimator, the |
| * <a href="https://arxiv.org/abs/1306.3284">Historical Inverse Probability (HIP)</a> estimator |
| * does not. |
| * The update speed performance of this sketch is quite fast and is comparable to the speed of HLL. |
| * The unioning (merging) capability of this sketch also allows for merging of sketches with |
| * different configurations of K. |
| * |
| * <p>For additional security this sketch can be configured with a user-specified hash seed. |
| * |
| * @author Lee Rhodes |
| * @author Kevin Lang |
| */ |
| public final class CpcSketch { |
| private static final String LS = System.getProperty("line.separator"); |
| private static final double[] kxpByteLookup = new double[256]; |
| |
| /** |
| * The default Log_base2 of K |
| */ |
| public static final int DEFAULT_LG_K = 11; |
| final long seed; |
| //common variables |
| final int lgK; |
| long numCoupons; // The number of coupons collected so far. |
| boolean mergeFlag; // Is the sketch the result of merging? |
| int fiCol; // First Interesting Column. This is part of a speed optimization. |
| |
| int windowOffset; |
| byte[] slidingWindow; //either null or size K bytes |
| PairTable pairTable; //for sparse and surprising values, either null or variable size |
| |
| //The following variables are only valid in HIP varients |
| double kxp; //used with HIP |
| double hipEstAccum; //used with HIP |
| |
| /** |
| * Constructor with default log_base2 of k |
| */ |
| public CpcSketch() { |
| this(DEFAULT_LG_K, DEFAULT_UPDATE_SEED); |
| } |
| |
| /** |
| * Constructor with log_base2 of k. |
| * @param lgK the given log_base2 of k |
| */ |
| public CpcSketch(final int lgK) { |
| this(lgK, DEFAULT_UPDATE_SEED); |
| } |
| |
| /** |
| * Constructor with log_base2 of k and seed. |
| * @param lgK the given log_base2 of k |
| * @param seed the given seed |
| */ |
| public CpcSketch(final int lgK, final long seed) { |
| checkLgK(lgK); |
| this.lgK = (byte) lgK; |
| this.seed = seed; |
| kxp = 1 << lgK; |
| reset(); |
| } |
| |
| /** |
| * Returns a copy of this sketch |
| * @return a copy of this sketch |
| */ |
| CpcSketch copy() { |
| final CpcSketch copy = new CpcSketch(lgK, seed); |
| copy.numCoupons = numCoupons; |
| copy.mergeFlag = mergeFlag; |
| copy.fiCol = fiCol; |
| |
| copy.windowOffset = windowOffset; |
| copy.slidingWindow = (slidingWindow == null) ? null : slidingWindow.clone(); |
| copy.pairTable = (pairTable == null) ? null : pairTable.copy(); |
| |
| copy.kxp = kxp; |
| copy.hipEstAccum = hipEstAccum; |
| return copy; |
| } |
| |
| /** |
| * Returns the best estimate of the cardinality of the sketch. |
| * @return the best estimate of the cardinality of the sketch. |
| */ |
| public double getEstimate() { |
| if (mergeFlag) { return IconEstimator.getIconEstimate(lgK, numCoupons); } |
| return hipEstAccum; |
| } |
| |
| /** |
| * Return the DataSketches identifier for this CPC family of sketches. |
| * @return the DataSketches identifier for this CPC family of sketches. |
| */ |
| public static Family getFamily() { |
| return Family.CPC; |
| } |
| |
| /** |
| * Return the parameter LgK. |
| * @return the parameter LgK. |
| */ |
| public int getLgK() { |
| return lgK; |
| } |
| |
| /** |
| * Returns the best estimate of the lower bound of the confidence interval given <i>kappa</i>, |
| * the number of standard deviations from the mean. |
| * @param kappa the given number of standard deviations from the mean: 1, 2 or 3. |
| * @return the best estimate of the lower bound of the confidence interval given <i>kappa</i>. |
| */ |
| public double getLowerBound(final int kappa) { |
| if (mergeFlag) { |
| return CpcConfidence.getIconConfidenceLB(lgK, numCoupons, kappa); |
| } |
| return CpcConfidence.getHipConfidenceLB(lgK, numCoupons, hipEstAccum, kappa); |
| } |
| |
| /* |
| * These empirical values for the 99.9th percentile of size in bytes were measured using 100,000 |
| * trials. The value for each trial is the maximum of 5*16=80 measurements that were equally |
| * spaced over values of the quantity C/K between 3.0 and 8.0. This table does not include the |
| * worst-case space for the preamble, which is added by the function. |
| */ |
| private static final int[] empiricalMaxBytes = { |
| 24, // lgK = 4 |
| 36, // lgK = 5 |
| 56, // lgK = 6 |
| 100, // lgK = 7 |
| 180, // lgK = 8 |
| 344, // lgK = 9 |
| 660, // lgK = 10 |
| 1292, // lgK = 11 |
| 2540, // lgK = 12 |
| 5020, // lgK = 13 |
| 9968, // lgK = 14 |
| 19836, // lgK = 15 |
| 39532, // lgK = 16 |
| 78880, // lgK = 17 |
| 157516, // lgK = 18 |
| 314656 // lgK = 19 |
| }; |
| |
| /** |
| * The actual size of a compressed CPC sketch has a small random variance, but the following |
| * empirically measured size should be large enough for at least 99.9 percent of sketches. |
| * |
| * <p>For small values of <i>n</i> the size can be much smaller. |
| * |
| * @param lgK the given value of lgK. |
| * @return the estimated maximum compressed serialized size of a sketch. |
| */ |
| public static int getMaxSerializedBytes(final int lgK) { |
| checkLgK(lgK); |
| if (lgK <= 19) { return empiricalMaxBytes[lgK - 4] + 40; } |
| final int k = 1 << lgK; |
| return (int) (0.6 * k) + 40; // 0.6 = 4.8 / 8.0 |
| } |
| |
| /** |
| * Returns the best estimate of the upper bound of the confidence interval given <i>kappa</i>, |
| * the number of standard deviations from the mean. |
| * @param kappa the given number of standard deviations from the mean: 1, 2 or 3. |
| * @return the best estimate of the upper bound of the confidence interval given <i>kappa</i>. |
| */ |
| public double getUpperBound(final int kappa) { |
| if (mergeFlag) { |
| return CpcConfidence.getIconConfidenceUB(lgK, numCoupons, kappa); |
| } |
| return CpcConfidence.getHipConfidenceUB(lgK, numCoupons, hipEstAccum, kappa); |
| } |
| |
| /** |
| * Return the given Memory as a CpcSketch on the Java heap using the DEFAULT_UPDATE_SEED. |
| * @param mem the given Memory |
| * @return the given Memory as a CpcSketch on the Java heap. |
| */ |
| public static CpcSketch heapify(final Memory mem) { |
| return heapify(mem, DEFAULT_UPDATE_SEED); |
| } |
| |
| /** |
| * Return the given byte array as a CpcSketch on the Java heap using the DEFAULT_UPDATE_SEED. |
| * @param byteArray the given byte array |
| * @return the given byte array as a CpcSketch on the Java heap. |
| */ |
| public static CpcSketch heapify(final byte[] byteArray) { |
| return heapify(byteArray, DEFAULT_UPDATE_SEED); |
| } |
| |
| /** |
| * Return the given Memory as a CpcSketch on the Java heap. |
| * @param mem the given Memory |
| * @param seed the seed used to create the original sketch from which the Memory was derived. |
| * @return the given Memory as a CpcSketch on the Java heap. |
| */ |
| public static CpcSketch heapify(final Memory mem, final long seed) { |
| final CompressedState state = CompressedState.importFromMemory(mem); |
| return uncompress(state, seed); |
| } |
| |
| /** |
| * Return the given byte array as a CpcSketch on the Java heap. |
| * @param byteArray the given byte array |
| * @param seed the seed used to create the original sketch from which the byte array was derived. |
| * @return the given byte array as a CpcSketch on the Java heap. |
| */ |
| public static CpcSketch heapify(final byte[] byteArray, final long seed) { |
| final Memory mem = Memory.wrap(byteArray); |
| return heapify(mem, seed); |
| } |
| |
| /** |
| * Return true if this sketch is empty |
| * @return true if this sketch is empty |
| */ |
| public boolean isEmpty() { |
| return numCoupons == 0; |
| } |
| |
| /** |
| * Resets this sketch to empty but retains the original LgK and Seed. |
| */ |
| public final void reset() { |
| numCoupons = 0; |
| mergeFlag = false; |
| fiCol = 0; |
| |
| windowOffset = 0; |
| slidingWindow = null; |
| pairTable = null; |
| |
| kxp = 1 << lgK; |
| hipEstAccum = 0; |
| } |
| |
| /** |
| * Return this sketch as a compressed byte array. |
| * @return this sketch as a compressed byte array. |
| */ |
| public byte[] toByteArray() { |
| final CompressedState state = CompressedState.compress(this); |
| final long cap = state.getRequiredSerializedBytes(); |
| final WritableMemory wmem = WritableMemory.allocate((int) cap); |
| state.exportToMemory(wmem); |
| return (byte[]) wmem.getArray(); |
| } |
| |
| /** |
| * Present the given long as a potential unique item. |
| * |
| * @param datum The given long datum. |
| */ |
| public void update(final long datum) { |
| final long[] data = { datum }; |
| final long[] arr = hash(data, seed); |
| hashUpdate(arr[0], arr[1]); |
| } |
| |
| /** |
| * Present the given double (or float) datum as a potential unique item. |
| * The double will be converted to a long using Double.doubleToLongBits(datum), |
| * which normalizes all NaN values to a single NaN representation. |
| * Plus and minus zero will be normalized to plus zero. |
| * The special floating-point values NaN and +/- Infinity are treated as distinct. |
| * |
| * @param datum The given double datum. |
| */ |
| public void update(final double datum) { |
| final double d = (datum == 0.0) ? 0.0 : datum; // canonicalize -0.0, 0.0 |
| final long[] data = { Double.doubleToLongBits(d) };// canonicalize all NaN forms |
| final long[] arr = hash(data, seed); |
| hashUpdate(arr[0], arr[1]); |
| } |
| |
| /** |
| * Present the given String as a potential unique item. |
| * The string is converted to a byte array using UTF8 encoding. |
| * If the string is null or empty no update attempt is made and the method returns. |
| * |
| * <p>Note: About 2X faster performance can be obtained by first converting the String to a |
| * char[] and updating the sketch with that. This bypasses the complexity of the Java UTF_8 |
| * encoding. This, of course, will not produce the same internal hash values as updating directly |
| * with a String. So be consistent! Unioning two sketches, one fed with strings and the other |
| * fed with char[] will be meaningless. |
| * </p> |
| * |
| * @param datum The given String. |
| */ |
| public void update(final String datum) { |
| if ((datum == null) || datum.isEmpty()) { return; } |
| final byte[] data = datum.getBytes(UTF_8); |
| final long[] arr = hash(data, seed); |
| hashUpdate(arr[0], arr[1]); |
| } |
| |
| /** |
| * Present the given byte array as a potential unique item. |
| * If the byte array is null or empty no update attempt is made and the method returns. |
| * |
| * @param data The given byte array. |
| */ |
| public void update(final byte[] data) { |
| if ((data == null) || (data.length == 0)) { return; } |
| final long[] arr = hash(data, seed); |
| hashUpdate(arr[0], arr[1]); |
| } |
| |
| /** |
| * Present the given char array as a potential unique item. |
| * If the char array is null or empty no update attempt is made and the method returns. |
| * |
| * <p>Note: this will not produce the same output hash values as the {@link #update(String)} |
| * method but will be a little faster as it avoids the complexity of the UTF8 encoding.</p> |
| * |
| * @param data The given char array. |
| */ |
| public void update(final char[] data) { |
| if ((data == null) || (data.length == 0)) { return; } |
| final long[] arr = hash(data, seed); |
| hashUpdate(arr[0], arr[1]); |
| } |
| |
| /** |
| * Present the given integer array as a potential unique item. |
| * If the integer array is null or empty no update attempt is made and the method returns. |
| * |
| * @param data The given int array. |
| */ |
| public void update(final int[] data) { |
| if ((data == null) || (data.length == 0)) { return; } |
| final long[] arr = hash(data, seed); |
| hashUpdate(arr[0], arr[1]); |
| } |
| |
| /** |
| * Present the given long array as a potential unique item. |
| * If the long array is null or empty no update attempt is made and the method returns. |
| * |
| * @param data The given long array. |
| */ |
| public void update(final long[] data) { |
| if ((data == null) || (data.length == 0)) { return; } |
| final long[] arr = hash(data, seed); |
| hashUpdate(arr[0], arr[1]); |
| } |
| |
| /** |
| * Convience function that this Sketch is valid. This is a troubleshooting tool |
| * for sketches that have been heapified from serialized images. |
| * |
| * <p>If you are starting with a serialized image as a byte array, first heapify |
| * the byte array to a sketch, which performs a number of checks. Then use this |
| * function as one additional check on the sketch.</p> |
| * |
| * @return true if this sketch is validated. |
| */ |
| public boolean validate() { |
| final long[] bitMatrix = bitMatrixOfSketch(this); |
| final long matrixCoupons = countBitsSetInMatrix(bitMatrix); |
| return matrixCoupons == numCoupons; |
| } |
| |
| /** |
| * Returns the current Flavor of this sketch. |
| * @return the current Flavor of this sketch. |
| */ |
| Flavor getFlavor() { |
| return CpcUtil.determineFlavor(lgK, numCoupons); |
| } |
| |
| /** |
| * Returns the Format of the serialized form of this sketch. |
| * @return the Format of the serialized form of this sketch. |
| */ |
| Format getFormat() { |
| final int ordinal; |
| final Flavor f = getFlavor(); |
| if ((f == Flavor.HYBRID) || (f == Flavor.SPARSE)) { |
| ordinal = 2 | ( mergeFlag ? 0 : 1 ); //Hybrid is serialized as SPARSE |
| } else { |
| ordinal = ((slidingWindow != null) ? 4 : 0) |
| | (((pairTable != null) && (pairTable.getNumPairs() > 0)) ? 2 : 0) |
| | ( mergeFlag ? 0 : 1 ); |
| } |
| return Format.ordinalToFormat(ordinal); |
| } |
| |
| private static void promoteEmptyToSparse(final CpcSketch sketch) { |
| assert sketch.numCoupons == 0; |
| assert sketch.pairTable == null; |
| sketch.pairTable = new PairTable(2, 6 + sketch.lgK); |
| } |
| |
| //In terms of flavor, this promotes SPARSE to HYBRID. |
| private static void promoteSparseToWindowed(final CpcSketch sketch) { |
| final int lgK = sketch.lgK; |
| final int k = (1 << lgK); |
| final long c32 = sketch.numCoupons << 5; |
| assert ((c32 == (3 * k)) || ((lgK == 4) && (c32 > (3 * k)))); |
| |
| final byte[] window = new byte[k]; |
| |
| final PairTable newTable = new PairTable(2, 6 + lgK); |
| final PairTable oldTable = sketch.pairTable; |
| |
| final int[] oldSlots = oldTable.getSlotsArr(); |
| final int oldNumSlots = (1 << oldTable.getLgSizeInts()); |
| |
| assert (sketch.windowOffset == 0); |
| |
| for (int i = 0; i < oldNumSlots; i++) { |
| final int rowCol = oldSlots[i]; |
| if (rowCol != -1) { |
| final int col = rowCol & 63; |
| if (col < 8) { |
| final int row = rowCol >>> 6; |
| window[row] |= (1 << col); |
| } |
| else { |
| // cannot use Table.mustInsert(), because it doesn't provide for growth |
| final boolean isNovel = PairTable.maybeInsert(newTable, rowCol); |
| assert (isNovel == true); |
| } |
| } |
| } |
| |
| assert (sketch.slidingWindow == null); |
| sketch.slidingWindow = window; |
| sketch.pairTable = newTable; |
| } |
| |
| /** |
| * The KXP register is a double with roughly 50 bits of precision, but |
| * it might need roughly 90 bits to track the value with perfect accuracy. |
| * Therefore we recalculate KXP occasionally from the sketch's full bitMatrix |
| * so that it will reflect changes that were previously outside the mantissa. |
| * @param sketch the given sketch |
| * @param bitMatrix the given bit Matrix |
| */ |
| //Also used in test |
| static void refreshKXP(final CpcSketch sketch, final long[] bitMatrix) { |
| final int k = (1 << sketch.lgK); |
| |
| // for improved numerical accuracy, we separately sum the bytes of the U64's |
| final double[] byteSums = new double[8]; |
| |
| Arrays.fill(byteSums, 0.0); |
| |
| for (int i = 0; i < k; i++) { |
| long row = bitMatrix[i]; |
| for (int j = 0; j < 8; j++) { |
| final int byteIdx = (int) (row & 0XFFL); |
| byteSums[j] += kxpByteLookup[byteIdx]; |
| row >>>= 8; |
| } |
| } |
| |
| double total = 0.0; |
| |
| for (int j = 7; j-- > 0; ) { // the reverse order is important |
| final double factor = invPow2(8 * j); // pow(256, -j) == pow(2, -8 * j); |
| total += factor * byteSums[j]; |
| } |
| sketch.kxp = total; |
| } |
| |
| /** |
| * This moves the sliding window |
| * @param sketch the given sketch |
| * @param newOffset the new offset, which must be oldOffset + 1 |
| */ |
| private static void modifyOffset(final CpcSketch sketch, final int newOffset) { |
| assert ((newOffset >= 0) && (newOffset <= 56)); |
| assert (newOffset == (sketch.windowOffset + 1)); |
| assert (newOffset == CpcUtil.determineCorrectOffset(sketch.lgK, sketch.numCoupons)); |
| |
| assert (sketch.slidingWindow != null); |
| assert (sketch.pairTable != null); |
| final int k = 1 << sketch.lgK; |
| |
| // Construct the full-sized bit matrix that corresponds to the sketch |
| final long[] bitMatrix = CpcUtil.bitMatrixOfSketch(sketch); |
| |
| // refresh the KXP register on every 8th window shift. |
| if ((newOffset & 0x7) == 0) { refreshKXP(sketch, bitMatrix); } |
| |
| sketch.pairTable.clear(); |
| |
| final PairTable table = sketch.pairTable; |
| final byte[] window = sketch.slidingWindow; |
| final long maskForClearingWindow = (0XFFL << newOffset) ^ -1L; |
| final long maskForFlippingEarlyZone = (1L << newOffset) - 1L; |
| long allSurprisesORed = 0; |
| |
| for (int i = 0; i < k; i++) { |
| long pattern = bitMatrix[i]; |
| window[i] = (byte) ((pattern >>> newOffset) & 0XFFL); |
| pattern &= maskForClearingWindow; |
| // The following line converts surprising 0's to 1's in the "early zone", |
| // (and vice versa, which is essential for this procedure's O(k) time cost). |
| pattern ^= maskForFlippingEarlyZone; |
| allSurprisesORed |= pattern; // a cheap way to recalculate fiCol |
| while (pattern != 0) { |
| final int col = Long.numberOfTrailingZeros(pattern); |
| pattern = pattern ^ (1L << col); // erase the 1. |
| final int rowCol = (i << 6) | col; |
| final boolean isNovel = PairTable.maybeInsert(table, rowCol); |
| assert isNovel == true; |
| } |
| } |
| sketch.windowOffset = newOffset; |
| sketch.fiCol = Long.numberOfTrailingZeros(allSurprisesORed); |
| if (sketch.fiCol > newOffset) { |
| sketch.fiCol = newOffset; // corner case |
| } |
| } |
| |
| /** |
| * Call this whenever a new coupon has been collected. |
| * @param sketch the given sketch |
| * @param rowCol the given row / column |
| */ |
| private static void updateHIP(final CpcSketch sketch, final int rowCol) { |
| final int k = 1 << sketch.lgK; |
| final int col = rowCol & 63; |
| final double oneOverP = k / sketch.kxp; |
| sketch.hipEstAccum += oneOverP; |
| sketch.kxp -= invPow2(col + 1); // notice the "+1" |
| } |
| |
| private static void updateSparse(final CpcSketch sketch, final int rowCol) { |
| final int k = 1 << sketch.lgK; |
| final long c32pre = sketch.numCoupons << 5; |
| assert (c32pre < (3L * k)); // C < 3K/32, in other words, flavor == SPARSE |
| assert (sketch.pairTable != null); |
| final boolean isNovel = PairTable.maybeInsert(sketch.pairTable, rowCol); |
| if (isNovel) { |
| sketch.numCoupons += 1; |
| updateHIP(sketch, rowCol); |
| final long c32post = sketch.numCoupons << 5; |
| if (c32post >= (3L * k)) { promoteSparseToWindowed(sketch); } // C >= 3K/32 |
| } |
| } |
| |
| /** |
| * The flavor is HYBRID, PINNED, or SLIDING. |
| * @param sketch the given sketch |
| * @param rowCol the given rowCol |
| */ |
| private static void updateWindowed(final CpcSketch sketch, final int rowCol) { |
| assert ((sketch.windowOffset >= 0) && (sketch.windowOffset <= 56)); |
| final int k = 1 << sketch.lgK; |
| final long c32pre = sketch.numCoupons << 5; |
| assert c32pre >= (3L * k); // C < 3K/32, in other words flavor >= HYBRID |
| final long c8pre = sketch.numCoupons << 3; |
| final int w8pre = sketch.windowOffset << 3; |
| assert c8pre < ((27L + w8pre) * k); // C < (K * 27/8) + (K * windowOffset) |
| |
| boolean isNovel = false; //novel if new coupon |
| final int col = rowCol & 63; |
| |
| if (col < sketch.windowOffset) { // track the surprising 0's "before" the window |
| isNovel = PairTable.maybeDelete(sketch.pairTable, rowCol); // inverted logic |
| } |
| else if (col < (sketch.windowOffset + 8)) { // track the 8 bits inside the window |
| assert (col >= sketch.windowOffset); |
| final int row = rowCol >>> 6; |
| final byte oldBits = sketch.slidingWindow[row]; |
| final byte newBits = (byte) (oldBits | (1 << (col - sketch.windowOffset))); |
| if (newBits != oldBits) { |
| sketch.slidingWindow[row] = newBits; |
| isNovel = true; |
| } |
| } |
| else { // track the surprising 1's "after" the window |
| assert col >= (sketch.windowOffset + 8); |
| isNovel = PairTable.maybeInsert(sketch.pairTable, rowCol); // normal logic |
| } |
| |
| if (isNovel) { |
| sketch.numCoupons += 1; |
| updateHIP(sketch, rowCol); |
| final long c8post = sketch.numCoupons << 3; |
| if (c8post >= ((27L + w8pre) * k)) { |
| modifyOffset(sketch, sketch.windowOffset + 1); |
| assert (sketch.windowOffset >= 1) && (sketch.windowOffset <= 56); |
| final int w8post = sketch.windowOffset << 3; |
| assert c8post < ((27L + w8post) * k); // C < (K * 27/8) + (K * windowOffset) |
| } |
| } |
| } |
| |
| |
| //also used in test |
| static CpcSketch uncompress(final CompressedState source, final long seed) { |
| checkSeedHashes(computeSeedHash(seed), source.seedHash); |
| final CpcSketch sketch = new CpcSketch(source.lgK, seed); |
| sketch.numCoupons = source.numCoupons; |
| sketch.windowOffset = source.getWindowOffset(); |
| sketch.fiCol = source.fiCol; |
| sketch.mergeFlag = source.mergeFlag; |
| sketch.kxp = source.kxp; |
| sketch.hipEstAccum = source.hipEstAccum; |
| sketch.slidingWindow = null; |
| sketch.pairTable = null; |
| CpcCompression.uncompress(source, sketch); |
| return sketch; |
| } |
| |
| //Used here and for testing |
| void hashUpdate(final long hash0, final long hash1) { |
| int col = Long.numberOfLeadingZeros(hash1); |
| if (col < fiCol) { return; } // important speed optimization |
| if (col > 63) { col = 63; } // clip so that 0 <= col <= 63 |
| final long c = numCoupons; |
| if (c == 0) { promoteEmptyToSparse(this); } |
| final long k = 1L << lgK; |
| final int row = (int) (hash0 & (k - 1L)); |
| int rowCol = (row << 6) | col; |
| |
| // Avoid the hash table's "empty" value which is (2^26 -1, 63) (all ones) by changing it |
| // to the pair (2^26 - 2, 63), which effectively merges the two cells. |
| // This case is *extremely* unlikely, but we might as well handle it. |
| // It can't happen at all if lgK (or maxLgK) < 26. |
| if (rowCol == -1) { rowCol ^= (1 << 6); } //set the LSB of row to 0 |
| |
| if ((c << 5) < (3L * k)) { updateSparse(this, rowCol); } |
| else { updateWindowed(this, rowCol); } |
| } |
| |
| //Used by union and in testing |
| void rowColUpdate(final int rowCol) { |
| final int col = rowCol & 63; |
| if (col < fiCol) { return; } // important speed optimization |
| final long c = numCoupons; |
| if (c == 0) { promoteEmptyToSparse(this); } |
| final long k = 1L << lgK; |
| if ((c << 5) < (3L * k)) { updateSparse(this, rowCol); } |
| else { updateWindowed(this, rowCol); } |
| } |
| |
| /** |
| * Return a human-readable string summary of this sketch |
| */ |
| @Override |
| public String toString() { |
| return toString(false); |
| } |
| |
| /** |
| * Return a human-readable string summary of this sketch |
| * @param detail include data detail |
| * @return a human-readable string summary of this sketch |
| */ |
| public String toString(final boolean detail) { |
| final int numPairs = (pairTable == null) ? 0 : pairTable.getNumPairs(); |
| final int seedHash = Short.toUnsignedInt(computeSeedHash(seed)); |
| final double errConst = mergeFlag ? log(2) : sqrt(log(2) / 2.0); |
| final double rse = errConst / Math.sqrt(1 << lgK); |
| final StringBuilder sb = new StringBuilder(); |
| sb.append("### CPD SKETCH - PREAMBLE:").append(LS); |
| sb.append(" Flavor : ").append(getFlavor()).append(LS); |
| sb.append(" LgK : ").append(lgK).append(LS); |
| sb.append(" Merge Flag : ").append(mergeFlag).append(LS); |
| sb.append(" Error Const : ").append(errConst).append(LS); |
| sb.append(" RSE : ").append(rse).append(LS); |
| sb.append(" Seed Hash : ").append(Integer.toHexString(seedHash)) |
| .append(" | ").append(seedHash).append(LS); |
| sb.append(" Num Coupons : ").append(numCoupons).append(LS); |
| sb.append(" Num Pairs (SV) : ").append(numPairs).append(LS); |
| sb.append(" First Inter Col: ").append(fiCol).append(LS); |
| sb.append(" Valid Window : ").append(slidingWindow != null).append(LS); |
| sb.append(" Valid PairTable: ").append(pairTable != null).append(LS); |
| sb.append(" Window Offset : ").append(windowOffset).append(LS); |
| sb.append(" KxP : ").append(kxp).append(LS); |
| sb.append(" HIP Accum : ").append(hipEstAccum).append(LS); |
| if (detail) { |
| sb.append(LS).append("### CPC SKETCH - DATA").append(LS); |
| if (pairTable != null) { |
| sb.append(pairTable.toString(true)); |
| } |
| if (slidingWindow != null) { |
| sb.append("SlidingWindow : ").append(LS); |
| sb.append(" Index Bits (lsb ->)").append(LS); |
| for (int i = 0; i < slidingWindow.length; i++) { |
| |
| final String bits = zeroPad(Integer.toBinaryString(slidingWindow[i] & 0XFF), 8); |
| sb.append(String.format("%9d %8s" + LS, i, bits)); |
| } |
| } |
| } |
| sb.append("### END CPC SKETCH"); |
| return sb.toString(); |
| } |
| |
| /** |
| * Returns a human readable string of the preamble of a byte array image of a CpcSketch. |
| * @param byteArr the given byte array |
| * @param detail if true, a dump of the compressed window and surprising value streams will be |
| * included. |
| * @return a human readable string of the preamble of a byte array image of a CpcSketch. |
| */ |
| public static String toString(final byte[] byteArr, final boolean detail) { |
| return PreambleUtil.toString(byteArr, detail); |
| } |
| |
| /** |
| * Returns a human readable string of the preamble of a Memory image of a CpcSketch. |
| * @param mem the given Memory |
| * @param detail if true, a dump of the compressed window and surprising value streams will be |
| * included. |
| * @return a human readable string of the preamble of a Memory image of a CpcSketch. |
| */ |
| public static String toString(final Memory mem, final boolean detail) { |
| return PreambleUtil.toString(mem, detail); |
| } |
| |
| private static void fillKxpByteLookup() { //called from static initializer |
| for (int b = 0; b < 256; b++) { |
| double sum = 0; |
| for (int col = 0; col < 8; col++) { |
| final int bit = (b >>> col) & 1; |
| if (bit == 0) { // note the inverted logic |
| sum += invPow2(col + 1); //note the "+1" |
| } |
| } |
| kxpByteLookup[b] = sum; |
| } |
| } |
| |
| static { |
| fillKxpByteLookup(); |
| } |
| |
| } |