| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.solr.util.hll; |
| |
| import java.util.Arrays; |
| |
| import com.carrotsearch.hppc.IntByteHashMap; |
| import com.carrotsearch.hppc.LongHashSet; |
| import com.carrotsearch.hppc.cursors.IntByteCursor; |
| import com.carrotsearch.hppc.cursors.LongCursor; |
| import org.apache.solr.util.LongIterator; |
| |
| /** |
| * A probabilistic set of hashed <code>long</code> elements. Useful for computing |
| * the approximate cardinality of a stream of data in very small storage. |
| * |
| * A modified version of the <a href="http://algo.inria.fr/flajolet/Publications/FlFuGaMe07.pdf"> |
| * 'HyperLogLog' data structure and algorithm</a> is used, which combines both |
| * probabilistic and non-probabilistic techniques to improve the accuracy and |
| * storage requirements of the original algorithm. |
| * |
* More specifically, initializing and storing a new {@link HLL} will
* allocate a sentinel value symbolizing the empty set ({@link HLLType#EMPTY}).
* After the first few values are added, the unique hashed values
* (<code>long</code>s) are stored in a {@link HLLType#EXPLICIT} hash set.
* When configured, accuracy can be sacrificed for memory footprint: the
* values in the explicit set are "promoted" to a "{@link HLLType#SPARSE}"
* map-based HyperLogLog structure. Finally, when enough registers are set,
* the map-based HLL is converted to a bit-packed "{@link HLLType#FULL}"
* HyperLogLog structure.
| * |
| * This data structure is interoperable with the implementations found at: |
| * <ul> |
| * <li><a href="https://github.com/aggregateknowledge/postgresql-hll">postgresql-hll</a>, and</li> |
| * <li><a href="https://github.com/aggregateknowledge/js-hll">js-hll</a></li> |
| * </ul> |
| * when <a href="https://github.com/aggregateknowledge/postgresql-hll/blob/master/STORAGE.markdown">properly serialized</a>. |
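*
* <p>A minimal usage sketch (hashing via Guava's Murmur3 is an assumption
* here; any strong 64-bit hash of the input values will do):</p>
* <pre>{@code
*   HLL hll = new HLL(13, 5); // log2m = 13 (8192 registers), 5 bits/register
*   long[] values = {1L, 2L, 3L};
*   for (long value : values) {
*     // addRaw() expects an already-hashed value
*     hll.addRaw(Hashing.murmur3_128().hashLong(value).asLong());
*   }
*   long estimate = hll.cardinality();
* }</pre>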
| */ |
| public class HLL implements Cloneable { |
| // minimum and maximum values for the log-base-2 of the number of registers |
| // in the HLL |
| public static final int MINIMUM_LOG2M_PARAM = 4; |
| public static final int MAXIMUM_LOG2M_PARAM = 30; |
| |
| // minimum and maximum values for the register width of the HLL |
| public static final int MINIMUM_REGWIDTH_PARAM = 1; |
| public static final int MAXIMUM_REGWIDTH_PARAM = 8; |
| |
| // minimum and maximum values for the 'expthresh' parameter of the |
| // constructor that is meant to match the PostgreSQL implementation's |
| // constructor and parameter names |
| public static final int MINIMUM_EXPTHRESH_PARAM = -1; |
| public static final int MAXIMUM_EXPTHRESH_PARAM = 18; |
| public static final int MAXIMUM_EXPLICIT_THRESHOLD = (1 << (MAXIMUM_EXPTHRESH_PARAM - 1)/*per storage spec*/); |
| |
| // ************************************************************************ |
| // Storage |
| // storage used when #type is EXPLICIT, null otherwise |
| LongHashSet explicitStorage; |
| // storage used when #type is SPARSE, null otherwise |
| IntByteHashMap sparseProbabilisticStorage; |
| // storage used when #type is FULL, null otherwise |
| BitVector probabilisticStorage; |
| |
| // current type of this HLL instance, if this changes then so should the |
| // storage used (see above) |
| private HLLType type; |
| |
| // ------------------------------------------------------------------------ |
| // Characteristic parameters |
| // NOTE: These members are named to match the PostgreSQL implementation's |
| // parameters. |
| // log2(the number of probabilistic HLL registers) |
| private final int log2m; |
// the size (width) of each register, in bits
| private final int regwidth; |
| |
| // ------------------------------------------------------------------------ |
| // Computed constants |
| // ........................................................................ |
| // EXPLICIT-specific constants |
| // flag indicating if the EXPLICIT representation should NOT be used |
| private final boolean explicitOff; |
| // flag indicating that the promotion threshold from EXPLICIT should be |
| // computed automatically |
| // NOTE: this only has meaning when 'explicitOff' is false |
| private final boolean explicitAuto; |
// threshold (in element count) at which an EXPLICIT HLL is converted to a
// SPARSE or FULL HLL; always either zero or a power of two
// NOTE: this only has meaning when 'explicitOff' is false
| private final int explicitThreshold; |
| |
| // ........................................................................ |
| // SPARSE-specific constants |
| // the computed width of the short words |
| private final int shortWordLength; |
| // flag indicating if the SPARSE representation should not be used |
| private final boolean sparseOff; |
| // threshold (in register count) at which a SPARSE HLL is converted to a |
| // FULL HLL, always greater than zero |
| private final int sparseThreshold; |
| |
| // ........................................................................ |
| // Probabilistic algorithm constants |
| // the number of registers, will always be a power of 2 |
| private final int m; |
// a mask with the lowest log2m bits set to one and the rest zero
| private final int mBitsMask; |
| // a mask as wide as a register (see #fromBytes()) |
| private final int valueMask; |
// mask used to ensure that p(w) does not overflow the register (see #Constructor() and #addRaw())
| private final long pwMaxMask; |
| // alpha * m^2 (the constant in the "'raw' HyperLogLog estimator") |
| private final double alphaMSquared; |
| // the cutoff value of the estimator for using the "small" range cardinality |
| // correction formula |
| private final double smallEstimatorCutoff; |
| // the cutoff value of the estimator for using the "large" range cardinality |
| // correction formula |
| private final double largeEstimatorCutoff; |
| |
| // ======================================================================== |
| /** |
| * NOTE: Arguments here are named and structured identically to those in the |
| * PostgreSQL implementation, which can be found |
| * <a href="https://github.com/aggregateknowledge/postgresql-hll/blob/master/README.markdown#explanation-of-parameters-and-tuning">here</a>. |
| * |
| * @param log2m log-base-2 of the number of registers used in the HyperLogLog |
| * algorithm. Must be at least 4 and at most 30. |
| * @param regwidth number of bits used per register in the HyperLogLog |
| * algorithm. Must be at least 1 and at most 8. |
| * @param expthresh tunes when the {@link HLLType#EXPLICIT} to |
| * {@link HLLType#SPARSE} promotion occurs, |
| * based on the set's cardinality. Must be at least -1 and at most 18. |
| * @param sparseon Flag indicating if the {@link HLLType#SPARSE} |
| * representation should be used. |
| * @param type the type in the promotion hierarchy which this instance should |
| * start at. This cannot be <code>null</code>. |
| */ |
| public HLL(final int log2m, final int regwidth, final int expthresh, final boolean sparseon, final HLLType type) { |
| this.log2m = log2m; |
| if((log2m < MINIMUM_LOG2M_PARAM) || (log2m > MAXIMUM_LOG2M_PARAM)) { |
| throw new IllegalArgumentException("'log2m' must be at least " + MINIMUM_LOG2M_PARAM + " and at most " + MAXIMUM_LOG2M_PARAM + " (was: " + log2m + ")"); |
| } |
| |
| this.regwidth = regwidth; |
| if((regwidth < MINIMUM_REGWIDTH_PARAM) || (regwidth > MAXIMUM_REGWIDTH_PARAM)) { |
| throw new IllegalArgumentException("'regwidth' must be at least " + MINIMUM_REGWIDTH_PARAM + " and at most " + MAXIMUM_REGWIDTH_PARAM + " (was: " + regwidth + ")"); |
| } |
| |
| this.m = (1 << log2m); |
| this.mBitsMask = m - 1; |
| this.valueMask = (1 << regwidth) - 1; |
| this.pwMaxMask = HLLUtil.pwMaxMask(regwidth); |
| this.alphaMSquared = HLLUtil.alphaMSquared(m); |
| this.smallEstimatorCutoff = HLLUtil.smallEstimatorCutoff(m); |
| this.largeEstimatorCutoff = HLLUtil.largeEstimatorCutoff(log2m, regwidth); |
| |
| if(expthresh == -1) { |
| this.explicitAuto = true; |
| this.explicitOff = false; |
| |
| // NOTE: This math matches the size calculation in the PostgreSQL impl. |
| final long fullRepresentationSize = (this.regwidth * (long)this.m + 7/*round up to next whole byte*/)/Byte.SIZE; |
| final int numLongs = (int)(fullRepresentationSize / 8/*integer division to round down*/); |
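// e.g. log2m = 11, regwidth = 5: 2048 registers * 5 bits = 10240 bits
// = 1280 bytes = 160 longs, so the auto-computed threshold is 160
// (well under MAXIMUM_EXPLICIT_THRESHOLD = 2^17)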
| |
| if(numLongs > MAXIMUM_EXPLICIT_THRESHOLD) { |
| this.explicitThreshold = MAXIMUM_EXPLICIT_THRESHOLD; |
| } else { |
| this.explicitThreshold = numLongs; |
| } |
| } else if(expthresh == 0) { |
| this.explicitAuto = false; |
| this.explicitOff = true; |
| this.explicitThreshold = 0; |
| } else if((expthresh > 0) && (expthresh <= MAXIMUM_EXPTHRESH_PARAM)){ |
| this.explicitAuto = false; |
| this.explicitOff = false; |
| this.explicitThreshold = (1 << (expthresh - 1)); |
| } else { |
| throw new IllegalArgumentException("'expthresh' must be at least " + MINIMUM_EXPTHRESH_PARAM + " and at most " + MAXIMUM_EXPTHRESH_PARAM + " (was: " + expthresh + ")"); |
| } |
| |
| this.shortWordLength = (regwidth + log2m); |
| this.sparseOff = !sparseon; |
| if(this.sparseOff) { |
| this.sparseThreshold = 0; |
| } else { |
| // TODO improve this cutoff to include the cost overhead of Java |
| // members/objects |
| final int largestPow2LessThanCutoff = |
| (int)NumberUtil.log2((this.m * this.regwidth) / this.shortWordLength); |
| this.sparseThreshold = (1 << largestPow2LessThanCutoff); |
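// e.g. log2m = 11, regwidth = 5: shortWordLength = 16, so the cutoff is
// (2048 * 5)/16 = 640 and the largest power of two not exceeding it
// yields sparseThreshold = 2^9 = 512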
| } |
| |
| initializeStorage(type); |
| } |
| |
| /** |
| * Construct an empty HLL with the given {@code log2m} and {@code regwidth}. |
| * |
* This is equivalent to calling <code>new HLL(log2m, regwidth, -1, true, HLLType.EMPTY)</code>.
| * |
| * @param log2m log-base-2 of the number of registers used in the HyperLogLog |
| * algorithm. Must be at least 4 and at most 30. |
| * @param regwidth number of bits used per register in the HyperLogLog |
| * algorithm. Must be at least 1 and at most 8. |
| * |
| * @see #HLL(int, int, int, boolean, HLLType) |
| */ |
| public HLL(final int log2m, final int regwidth) { |
| this(log2m, regwidth, -1, true, HLLType.EMPTY); |
| } |
| |
| // ------------------------------------------------------------------------- |
| /** |
| * Convenience constructor for testing. Assumes that both {@link HLLType#EXPLICIT} |
| * and {@link HLLType#SPARSE} representations should be enabled. |
| * |
| * @param log2m log-base-2 of the number of registers used in the HyperLogLog |
| * algorithm. Must be at least 4 and at most 30. |
| * @param regwidth number of bits used per register in the HyperLogLog |
| * algorithm. Must be at least 1 and at most 8. |
| * @param explicitThreshold cardinality threshold at which the {@link HLLType#EXPLICIT} |
| * representation should be promoted to {@link HLLType#SPARSE}. |
| * This must be greater than zero and less than or equal to {@value #MAXIMUM_EXPLICIT_THRESHOLD}. |
| * @param sparseThreshold register count threshold at which the {@link HLLType#SPARSE} |
| * representation should be promoted to {@link HLLType#FULL}. |
| * This must be greater than zero. |
| * @param type the type in the promotion hierarchy which this instance should |
| * start at. This cannot be <code>null</code>. |
| */ |
| /*package, for testing*/ HLL(final int log2m, final int regwidth, final int explicitThreshold, final int sparseThreshold, final HLLType type) { |
| this.log2m = log2m; |
| if((log2m < MINIMUM_LOG2M_PARAM) || (log2m > MAXIMUM_LOG2M_PARAM)) { |
| throw new IllegalArgumentException("'log2m' must be at least " + MINIMUM_LOG2M_PARAM + " and at most " + MAXIMUM_LOG2M_PARAM + " (was: " + log2m + ")"); |
| } |
| |
| this.regwidth = regwidth; |
| if((regwidth < MINIMUM_REGWIDTH_PARAM) || (regwidth > MAXIMUM_REGWIDTH_PARAM)) { |
| throw new IllegalArgumentException("'regwidth' must be at least " + MINIMUM_REGWIDTH_PARAM + " and at most " + MAXIMUM_REGWIDTH_PARAM + " (was: " + regwidth + ")"); |
| } |
| |
| this.m = (1 << log2m); |
| this.mBitsMask = m - 1; |
| this.valueMask = (1 << regwidth) - 1; |
| this.pwMaxMask = HLLUtil.pwMaxMask(regwidth); |
| this.alphaMSquared = HLLUtil.alphaMSquared(m); |
| this.smallEstimatorCutoff = HLLUtil.smallEstimatorCutoff(m); |
| this.largeEstimatorCutoff = HLLUtil.largeEstimatorCutoff(log2m, regwidth); |
| |
| this.explicitAuto = false; |
| this.explicitOff = false; |
| this.explicitThreshold = explicitThreshold; |
| if((explicitThreshold < 1) || (explicitThreshold > MAXIMUM_EXPLICIT_THRESHOLD)) { |
| throw new IllegalArgumentException("'explicitThreshold' must be at least 1 and at most " + MAXIMUM_EXPLICIT_THRESHOLD + " (was: " + explicitThreshold + ")"); |
| } |
| |
| this.shortWordLength = (regwidth + log2m); |
| this.sparseOff = false; |
| this.sparseThreshold = sparseThreshold; |
| |
| initializeStorage(type); |
| } |
| |
| /** |
| * @return the type in the promotion hierarchy of this instance. This will |
| * never be <code>null</code>. |
| */ |
| public HLLType getType() { return type; } |
| |
| // ======================================================================== |
| // Add |
| /** |
| * Adds <code>rawValue</code> directly to the HLL. |
| * |
| * @param rawValue the value to be added. It is very important that this |
| * value <em>already be hashed</em> with a strong (but not |
| * necessarily cryptographic) hash function. For instance, the |
| * Murmur3 implementation in |
| * <a href="http://guava-libraries.googlecode.com/git/guava/src/com/google/common/hash/Murmur3_128HashFunction.java"> |
| * Google's Guava</a> library is an excellent hash function for this |
| * purpose and, for seeds greater than zero, matches the output |
| * of the hash provided in the PostgreSQL implementation. |
| */ |
| public void addRaw(final long rawValue) { |
| switch(type) { |
| case EMPTY: { |
| // NOTE: EMPTY type is always promoted on #addRaw() |
| if(explicitThreshold > 0) { |
| initializeStorage(HLLType.EXPLICIT); |
| explicitStorage.add(rawValue); |
| } else if(!sparseOff) { |
| initializeStorage(HLLType.SPARSE); |
| addRawSparseProbabilistic(rawValue); |
| } else { |
| initializeStorage(HLLType.FULL); |
| addRawProbabilistic(rawValue); |
| } |
| return; |
| } |
| case EXPLICIT: { |
| explicitStorage.add(rawValue); |
| |
| // promotion, if necessary |
| if(explicitStorage.size() > explicitThreshold) { |
| if(!sparseOff) { |
| initializeStorage(HLLType.SPARSE); |
| for (LongCursor c : explicitStorage) { |
| addRawSparseProbabilistic(c.value); |
| } |
| } else { |
| initializeStorage(HLLType.FULL); |
| for (LongCursor c : explicitStorage) { |
| addRawProbabilistic(c.value); |
| } |
| } |
| explicitStorage = null; |
| } |
| return; |
| } |
| case SPARSE: { |
| addRawSparseProbabilistic(rawValue); |
| |
| // promotion, if necessary |
| if(sparseProbabilisticStorage.size() > sparseThreshold) { |
| initializeStorage(HLLType.FULL); |
| for(IntByteCursor c : sparseProbabilisticStorage) { |
| final int registerIndex = c.key; |
| final byte registerValue = c.value; |
| probabilisticStorage.setMaxRegister(registerIndex, registerValue); |
| } |
| sparseProbabilisticStorage = null; |
| } |
| return; |
| } |
| case FULL: |
| addRawProbabilistic(rawValue); |
| return; |
| default: |
| throw new RuntimeException("Unsupported HLL type " + type); |
| } |
| } |
| |
| // ------------------------------------------------------------------------ |
| // #addRaw(..) helpers |
| /** |
| * Adds the raw value to the {@link #sparseProbabilisticStorage}. |
| * {@link #type} must be {@link HLLType#SPARSE}. |
| * |
| * @param rawValue the raw value to add to the sparse storage. |
| */ |
| private void addRawSparseProbabilistic(final long rawValue) { |
| // p(w): position of the least significant set bit (one-indexed) |
| // By contract: p(w) <= 2^(registerValueInBits) - 1 (the max register value) |
| // |
| // By construction of pwMaxMask (see #Constructor()), |
| // lsb(pwMaxMask) = 2^(registerValueInBits) - 2, |
| // thus lsb(any_long | pwMaxMask) <= 2^(registerValueInBits) - 2, |
| // thus 1 + lsb(any_long | pwMaxMask) <= 2^(registerValueInBits) -1. |
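//
// e.g. regwidth = 5: the max register value is 2^5 - 1 = 31 and
// lsb(pwMaxMask) = 30, so p_w = 1 + lsb(substreamValue | pwMaxMask)
// can never exceed 31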
| final long substreamValue = (rawValue >>> log2m); |
| final byte p_w; |
| |
if(substreamValue == 0L) {
// The paper does not cover p(0x0), so the special value 0 is used.
// 0 is the original initialization value of the registers, so by
// doing this the multiset simply ignores it. This is acceptable
// because the probability of an all-zero substream is vanishingly
// small: 2^-(64 - log2m) for a uniform 64-bit hash.
p_w = 0;
} else {
p_w = (byte)(1 + BitUtil.leastSignificantBit(substreamValue | pwMaxMask));
}
| |
| // Short-circuit if the register is being set to zero, since algorithmically |
| // this corresponds to an "unset" register, and "unset" registers aren't |
| // stored to save memory. (The very reason this sparse implementation |
| // exists.) If a register is set to zero it will break the #algorithmCardinality |
| // code. |
| if(p_w == 0) { |
| return; |
| } |
| |
| // NOTE: no +1 as in paper since 0-based indexing |
| final int j = (int)(rawValue & mBitsMask); |
| |
| final byte currentValue; |
| final int index = sparseProbabilisticStorage.indexOf(j); |
| if (index >= 0) { |
| currentValue = sparseProbabilisticStorage.indexGet(index); |
| } else { |
| currentValue = 0; |
| } |
| |
| if(p_w > currentValue) { |
| sparseProbabilisticStorage.put(j, p_w); |
| } |
| } |
| |
| /** |
| * Adds the raw value to the {@link #probabilisticStorage}. |
| * {@link #type} must be {@link HLLType#FULL}. |
| * |
| * @param rawValue the raw value to add to the full probabilistic storage. |
| */ |
| private void addRawProbabilistic(final long rawValue) { |
| // p(w): position of the least significant set bit (one-indexed) |
| // By contract: p(w) <= 2^(registerValueInBits) - 1 (the max register value) |
| // |
| // By construction of pwMaxMask (see #Constructor()), |
| // lsb(pwMaxMask) = 2^(registerValueInBits) - 2, |
| // thus lsb(any_long | pwMaxMask) <= 2^(registerValueInBits) - 2, |
| // thus 1 + lsb(any_long | pwMaxMask) <= 2^(registerValueInBits) -1. |
| final long substreamValue = (rawValue >>> log2m); |
| final byte p_w; |
| |
if (substreamValue == 0L) {
// The paper does not cover p(0x0), so the special value 0 is used.
// 0 is the original initialization value of the registers, so by
// doing this the multiset simply ignores it. This is acceptable
// because the probability of an all-zero substream is vanishingly
// small: 2^-(64 - log2m) for a uniform 64-bit hash.
p_w = 0;
} else {
p_w = (byte)(1 + BitUtil.leastSignificantBit(substreamValue | pwMaxMask));
}
| |
// Short-circuit if the register is being set to zero: algorithmically
// this corresponds to an "unset" register, which is the value the
// registers are initialized to, so there is nothing to write.
| if(p_w == 0) { |
| return; |
| } |
| |
| // NOTE: no +1 as in paper since 0-based indexing |
| final int j = (int)(rawValue & mBitsMask); |
| |
| probabilisticStorage.setMaxRegister(j, p_w); |
| } |
| |
| // ------------------------------------------------------------------------ |
| // Storage helper |
| /** |
| * Initializes storage for the specified {@link HLLType} and changes the |
| * instance's {@link #type}. |
| * |
| * @param type the {@link HLLType} to initialize storage for. This cannot be |
| * <code>null</code> and must be an instantiable type. |
| */ |
| private void initializeStorage(final HLLType type) { |
| this.type = type; |
| switch(type) { |
| case EMPTY: |
| // nothing to be done |
| break; |
| case EXPLICIT: |
| this.explicitStorage = new LongHashSet(); |
| break; |
| case SPARSE: |
| this.sparseProbabilisticStorage = new IntByteHashMap(); |
| break; |
| case FULL: |
| this.probabilisticStorage = new BitVector(regwidth, m); |
| break; |
| default: |
| throw new RuntimeException("Unsupported HLL type " + type); |
| } |
| } |
| |
| // ======================================================================== |
| // Cardinality |
| /** |
| * Computes the cardinality of the HLL. |
| * |
| * @return the cardinality of HLL. This will never be negative. |
| */ |
| public long cardinality() { |
| switch(type) { |
| case EMPTY: |
| return 0/*by definition*/; |
| case EXPLICIT: |
| return explicitStorage.size(); |
| case SPARSE: |
| return (long)Math.ceil(sparseProbabilisticAlgorithmCardinality()); |
| case FULL: |
| return (long)Math.ceil(fullProbabilisticAlgorithmCardinality()); |
| default: |
| throw new RuntimeException("Unsupported HLL type " + type); |
| } |
| } |
| |
| // ------------------------------------------------------------------------ |
| // Cardinality helpers |
| /** |
| * Computes the exact cardinality value returned by the HLL algorithm when |
| * represented as a {@link HLLType#SPARSE} HLL. Kept |
| * separate from {@link #cardinality()} for testing purposes. {@link #type} |
| * must be {@link HLLType#SPARSE}. |
| * |
| * @return the exact, unrounded cardinality given by the HLL algorithm |
| */ |
| /*package, for testing*/ double sparseProbabilisticAlgorithmCardinality() { |
| final int m = this.m/*for performance*/; |
| |
| // compute the "indicator function" -- sum(2^(-M[j])) where M[j] is the |
| // 'j'th register value |
| double sum = 0; |
| int numberOfZeroes = 0/*"V" in the paper*/; |
| for(int j=0; j<m; j++) { |
| final long register; |
| if (sparseProbabilisticStorage.containsKey(j)) { |
| register = sparseProbabilisticStorage.get(j); |
| } else { |
| register = 0; |
| } |
| |
| sum += 1.0 / (1L << register); |
| if(register == 0L) numberOfZeroes++; |
| } |
| |
| // apply the estimate and correction to the indicator function |
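// raw estimate (per the paper): E = alpha_m * m^2 / sum(2^(-M[j]))
// = alphaMSquared / sum. When V (the zero-register count) is non-zero
// and E falls below the small-range cutoff, linear counting (m * ln(m/V))
// is used instead; estimates above the large-range cutoff receive a
// range correction. The exact cutoff and correction formulas live in HLLUtil.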
| final double estimator = alphaMSquared / sum; |
| if((numberOfZeroes != 0) && (estimator < smallEstimatorCutoff)) { |
| return HLLUtil.smallEstimator(m, numberOfZeroes); |
| } else if(estimator <= largeEstimatorCutoff) { |
| return estimator; |
| } else { |
| return HLLUtil.largeEstimator(log2m, regwidth, estimator); |
| } |
| } |
| |
| /** |
| * Computes the exact cardinality value returned by the HLL algorithm when |
| * represented as a {@link HLLType#FULL} HLL. Kept |
| * separate from {@link #cardinality()} for testing purposes. {@link #type} |
| * must be {@link HLLType#FULL}. |
| * |
| * @return the exact, unrounded cardinality given by the HLL algorithm |
| */ |
| /*package, for testing*/ double fullProbabilisticAlgorithmCardinality() { |
| final int m = this.m/*for performance*/; |
| |
| // compute the "indicator function" -- sum(2^(-M[j])) where M[j] is the |
| // 'j'th register value |
| double sum = 0; |
| int numberOfZeroes = 0/*"V" in the paper*/; |
| final LongIterator iterator = probabilisticStorage.registerIterator(); |
| while(iterator.hasNext()) { |
| final long register = iterator.next(); |
| |
| sum += 1.0 / (1L << register); |
| if(register == 0L) numberOfZeroes++; |
| } |
| |
| // apply the estimate and correction to the indicator function |
| final double estimator = alphaMSquared / sum; |
| if((numberOfZeroes != 0) && (estimator < smallEstimatorCutoff)) { |
| return HLLUtil.smallEstimator(m, numberOfZeroes); |
| } else if(estimator <= largeEstimatorCutoff) { |
| return estimator; |
| } else { |
| return HLLUtil.largeEstimator(log2m, regwidth, estimator); |
| } |
| } |
| |
| // ======================================================================== |
| // Clear |
| /** |
| * Clears the HLL. The HLL will have cardinality zero and will act as if no |
| * elements have been added. |
| * |
| * NOTE: Unlike {@link #addRaw(long)}, <code>clear</code> does NOT handle |
| * transitions between {@link HLLType}s - a probabilistic type will remain |
| * probabilistic after being cleared. |
| */ |
| public void clear() { |
| switch(type) { |
| case EMPTY: |
| return /*do nothing*/; |
| case EXPLICIT: |
| explicitStorage.clear(); |
| return; |
| case SPARSE: |
| sparseProbabilisticStorage.clear(); |
| return; |
| case FULL: |
| probabilisticStorage.fill(0); |
| return; |
| default: |
| throw new RuntimeException("Unsupported HLL type " + type); |
| } |
| } |
| |
| // ======================================================================== |
| // Union |
| /** |
| * Computes the union of HLLs and stores the result in this instance. |
| * |
| * @param other the other {@link HLL} instance to union into this one. This |
| * cannot be <code>null</code>. |
| */ |
| public void union(final HLL other) { |
| // TODO: verify HLLs are compatible |
| final HLLType otherType = other.getType(); |
| |
if(type.equals(otherType)) {
homogeneousUnion(other);
} else {
heterogenousUnion(other);
}
| } |
| |
| // ------------------------------------------------------------------------ |
| // Union helpers |
| /** |
| * Computes the union of two HLLs, of different types, and stores the |
| * result in this instance. |
| * |
| * @param other the other {@link HLL} instance to union into this one. This |
| * cannot be <code>null</code>. |
| */ |
| /*package, for testing*/ void heterogenousUnion(final HLL other) { |
| /* |
| * The logic here is divided into two sections: unions with an EMPTY |
| * HLL, and unions between EXPLICIT/SPARSE/FULL |
| * HLL. |
| * |
* Between those two sections, all possible heterogeneous unions are
* covered. Should another type be added to HLLType whose unions are
* not as easily reduced (as EMPTY's are below), this may be more
* cleanly implemented with Strategies. As it stands, that is
* unnecessary.
| */ |
| |
| // .................................................................... |
| // Union with an EMPTY |
| if(HLLType.EMPTY.equals(type)) { |
| // NOTE: The union of empty with non-empty HLL is just a |
| // clone of the non-empty. |
| |
| switch(other.getType()) { |
| case EXPLICIT: { |
| // src: EXPLICIT |
| // dest: EMPTY |
| |
| if(other.explicitStorage.size() <= explicitThreshold) { |
| type = HLLType.EXPLICIT; |
| explicitStorage = other.explicitStorage.clone(); |
| } else { |
| if(!sparseOff) { |
| initializeStorage(HLLType.SPARSE); |
| } else { |
| initializeStorage(HLLType.FULL); |
| } |
| for(LongCursor c : other.explicitStorage) { |
| addRaw(c.value); |
| } |
| } |
| return; |
| } |
| case SPARSE: { |
| // src: SPARSE |
| // dest: EMPTY |
| |
| if(!sparseOff) { |
| type = HLLType.SPARSE; |
| sparseProbabilisticStorage = other.sparseProbabilisticStorage.clone(); |
| } else { |
| initializeStorage(HLLType.FULL); |
| for(IntByteCursor c : other.sparseProbabilisticStorage) { |
| final int registerIndex = c.key; |
| final byte registerValue = c.value; |
| probabilisticStorage.setMaxRegister(registerIndex, registerValue); |
| } |
| } |
| return; |
| } |
| default/*case FULL*/: { |
| // src: FULL |
| // dest: EMPTY |
| |
| type = HLLType.FULL; |
| probabilisticStorage = other.probabilisticStorage.clone(); |
| return; |
| } |
| } |
| } else if (HLLType.EMPTY.equals(other.getType())) { |
// the source is empty, so the destination is unchanged; nothing to do
| return; |
| } /* else -- both of the sets are not empty */ |
| |
| // .................................................................... |
| // NOTE: Since EMPTY is handled above, the HLLs are non-EMPTY below |
| switch(type) { |
| case EXPLICIT: { |
| // src: FULL/SPARSE |
| // dest: EXPLICIT |
| // "Storing into destination" cannot be done (since destination |
| // is by definition of smaller capacity than source), so a clone |
| // of source is made and values from destination are inserted |
| // into that. |
| |
| // Determine source and destination storage. |
| // NOTE: destination storage may change through promotion if |
| // source is SPARSE. |
| if(HLLType.SPARSE.equals(other.getType())) { |
| if(!sparseOff) { |
| type = HLLType.SPARSE; |
| sparseProbabilisticStorage = other.sparseProbabilisticStorage.clone(); |
| } else { |
| initializeStorage(HLLType.FULL); |
| for(IntByteCursor c : other.sparseProbabilisticStorage) { |
| final int registerIndex = c.key; |
| final byte registerValue = c.value; |
| probabilisticStorage.setMaxRegister(registerIndex, registerValue); |
| } |
| } |
| } else /*source is HLLType.FULL*/ { |
| type = HLLType.FULL; |
| probabilisticStorage = other.probabilisticStorage.clone(); |
| } |
| for(LongCursor c : explicitStorage) { |
| addRaw(c.value); |
| } |
| explicitStorage = null; |
| return; |
| } |
| case SPARSE: { |
| if(HLLType.EXPLICIT.equals(other.getType())) { |
| // src: EXPLICIT |
| // dest: SPARSE |
| // Add the raw values from the source to the destination. |
| |
| for(LongCursor c : other.explicitStorage) { |
| addRaw(c.value); |
| } |
| // NOTE: addRaw will handle promotion cleanup |
| } else /*source is HLLType.FULL*/ { |
| // src: FULL |
| // dest: SPARSE |
| // "Storing into destination" cannot be done (since destination |
| // is by definition of smaller capacity than source), so a |
| // clone of source is made and registers from the destination |
| // are merged into the clone. |
| |
| type = HLLType.FULL; |
| probabilisticStorage = other.probabilisticStorage.clone(); |
| for(IntByteCursor c : sparseProbabilisticStorage) { |
| final int registerIndex = c.key; |
| final byte registerValue = c.value; |
| probabilisticStorage.setMaxRegister(registerIndex, registerValue); |
| } |
| sparseProbabilisticStorage = null; |
| } |
| return; |
| } |
| default/*destination is HLLType.FULL*/: { |
| if(HLLType.EXPLICIT.equals(other.getType())) { |
| // src: EXPLICIT |
| // dest: FULL |
| // Add the raw values from the source to the destination. |
| // Promotion is not possible, so don't bother checking. |
| |
| for(LongCursor c : other.explicitStorage) { |
| addRaw(c.value); |
| } |
| } else /*source is HLLType.SPARSE*/ { |
| // src: SPARSE |
| // dest: FULL |
| // Merge the registers from the source into the destination. |
| // Promotion is not possible, so don't bother checking. |
| |
| for(IntByteCursor c : other.sparseProbabilisticStorage) { |
| final int registerIndex = c.key; |
| final byte registerValue = c.value; |
| probabilisticStorage.setMaxRegister(registerIndex, registerValue); |
| } |
| } |
| } |
| } |
| } |
| |
| /** |
| * Computes the union of two HLLs of the same type, and stores the |
| * result in this instance. |
| * |
| * @param other the other {@link HLL} instance to union into this one. This |
| * cannot be <code>null</code>. |
| */ |
| private void homogeneousUnion(final HLL other) { |
| switch(type) { |
| case EMPTY: |
| // union of empty and empty is empty |
| return; |
| case EXPLICIT: |
| for(LongCursor c : other.explicitStorage) { |
| addRaw(c.value); |
| } |
| // NOTE: #addRaw() will handle promotion, if necessary |
| return; |
| case SPARSE: |
| for(IntByteCursor c : other.sparseProbabilisticStorage) { |
| final int registerIndex = c.key; |
| final byte registerValue = c.value; |
| final byte currentRegisterValue = sparseProbabilisticStorage.get(registerIndex); |
| if(registerValue > currentRegisterValue) { |
| sparseProbabilisticStorage.put(registerIndex, registerValue); |
| } |
| } |
| |
| // promotion, if necessary |
| if(sparseProbabilisticStorage.size() > sparseThreshold) { |
| initializeStorage(HLLType.FULL); |
| for(IntByteCursor c : sparseProbabilisticStorage) { |
| final int registerIndex = c.key; |
| final byte registerValue = c.value; |
| probabilisticStorage.setMaxRegister(registerIndex, registerValue); |
| } |
| sparseProbabilisticStorage = null; |
| } |
| return; |
| case FULL: |
| for(int i=0; i<m; i++) { |
| final long registerValue = other.probabilisticStorage.getRegister(i); |
| probabilisticStorage.setMaxRegister(i, registerValue); |
| } |
| return; |
| default: |
| throw new RuntimeException("Unsupported HLL type " + type); |
| } |
| } |
| |
| // ======================================================================== |
| // Serialization |
| /** |
| * Serializes the HLL to an array of bytes in correspondence with the format |
| * of the default schema version, {@link SerializationUtil#DEFAULT_SCHEMA_VERSION}. |
| * |
| * @return the array of bytes representing the HLL. This will never be |
| * <code>null</code> or empty. |
| */ |
| public byte[] toBytes() { |
| return toBytes(SerializationUtil.DEFAULT_SCHEMA_VERSION); |
| } |
| |
| /** |
| * Serializes the HLL to an array of bytes in correspondence with the format |
| * of the specified schema version. |
| * |
| * @param schemaVersion the schema version dictating the serialization format |
| * @return the array of bytes representing the HLL. This will never be |
| * <code>null</code> or empty. |
| */ |
| public byte[] toBytes(final ISchemaVersion schemaVersion) { |
| final byte[] bytes; |
| switch(type) { |
| case EMPTY: |
| bytes = new byte[schemaVersion.paddingBytes(type)]; |
| break; |
| case EXPLICIT: { |
| final IWordSerializer serializer = |
| schemaVersion.getSerializer(type, Long.SIZE, explicitStorage.size()); |
| |
| final long[] values = explicitStorage.toArray(); |
| Arrays.sort(values); |
| for(final long value : values) { |
| serializer.writeWord(value); |
| } |
| |
| bytes = serializer.getBytes(); |
| break; |
| } |
| case SPARSE: { |
| final IWordSerializer serializer = |
| schemaVersion.getSerializer(type, shortWordLength, sparseProbabilisticStorage.size()); |
| |
| final int[] indices = sparseProbabilisticStorage.keys().toArray(); |
| Arrays.sort(indices); |
| for(final int registerIndex : indices) { |
| assert sparseProbabilisticStorage.containsKey(registerIndex); |
| final long registerValue = sparseProbabilisticStorage.get(registerIndex); |
// pack index and value into "short word"
// NOTE: cast the index to long before shifting so it is not truncated
//       when (log2m + regwidth) exceeds the width of an int
final long shortWord = (((long)registerIndex << regwidth) | registerValue);
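// e.g. log2m = 11, regwidth = 5: index 3 with value 5 packs to
// (3 << 5) | 5 = 101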
| serializer.writeWord(shortWord); |
| } |
| |
| bytes = serializer.getBytes(); |
| break; |
| } |
| case FULL: { |
| final IWordSerializer serializer = schemaVersion.getSerializer(type, regwidth, m); |
| probabilisticStorage.getRegisterContents(serializer); |
| |
| bytes = serializer.getBytes(); |
| break; |
| } |
| default: |
| throw new RuntimeException("Unsupported HLL type " + type); |
| } |
| |
| final IHLLMetadata metadata = new HLLMetadata(schemaVersion.schemaVersionNumber(), |
| type, |
| log2m, |
| regwidth, |
| (int)NumberUtil.log2(explicitThreshold), |
| explicitOff, |
| explicitAuto, |
| !sparseOff); |
| schemaVersion.writeMetadata(bytes, metadata); |
| |
| return bytes; |
| } |
| |
| /** |
* Deserializes an HLL from <code>bytes</code>, which must be in the format
* produced by {@link #toBytes(ISchemaVersion)}.
| * |
* @param bytes the serialized bytes of the HLL
| * @return the deserialized HLL. This will never be <code>null</code>. |
| * |
| * @see #toBytes(ISchemaVersion) |
| */ |
| public static HLL fromBytes(final byte[] bytes) { |
| final ISchemaVersion schemaVersion = SerializationUtil.getSchemaVersion(bytes); |
| final IHLLMetadata metadata = schemaVersion.readMetadata(bytes); |
| |
| final HLLType type = metadata.HLLType(); |
| final int regwidth = metadata.registerWidth(); |
| final int log2m = metadata.registerCountLog2(); |
| final boolean sparseon = metadata.sparseEnabled(); |
| |
| final int expthresh; |
| if(metadata.explicitAuto()) { |
| expthresh = -1; |
| } else if(metadata.explicitOff()) { |
| expthresh = 0; |
| } else { |
| // NOTE: take into account that the postgres-compatible constructor |
| // subtracts one before taking a power of two. |
| expthresh = metadata.log2ExplicitCutoff() + 1; |
| } |
| |
| final HLL hll = new HLL(log2m, regwidth, expthresh, sparseon, type); |
| |
| // Short-circuit on empty, which needs no other deserialization. |
| if(HLLType.EMPTY.equals(type)) { |
| return hll; |
| } |
| |
| final int wordLength; |
| switch(type) { |
| case EXPLICIT: |
| wordLength = Long.SIZE; |
| break; |
| case SPARSE: |
| wordLength = hll.shortWordLength; |
| break; |
| case FULL: |
| wordLength = hll.regwidth; |
| break; |
| default: |
| throw new RuntimeException("Unsupported HLL type " + type); |
| } |
| |
| final IWordDeserializer deserializer = |
| schemaVersion.getDeserializer(type, wordLength, bytes); |
| switch(type) { |
| case EXPLICIT: |
// NOTE: the word count should not exceed the explicit threshold, and
// it will always be exactly the number of words that were encoded,
// because the 64-bit word length is a whole number of bytes.
| // SEE: IWordDeserializer#totalWordCount() |
| for(int i=0; i<deserializer.totalWordCount(); i++) { |
| hll.explicitStorage.add(deserializer.readWord()); |
| } |
| break; |
| case SPARSE: |
| // NOTE: If the shortWordLength were smaller than 8 bits |
| // (1 byte) there would be a possibility (because of |
| // padding arithmetic) of having one or more extra |
| // registers read. However, this is not relevant as the |
| // extra registers will be all zeroes, which are ignored |
| // in the sparse representation. |
| for(int i=0; i<deserializer.totalWordCount(); i++) { |
| final long shortWord = deserializer.readWord(); |
| final byte registerValue = (byte)(shortWord & hll.valueMask); |
| // Only set non-zero registers. |
| if (registerValue != 0) { |
| hll.sparseProbabilisticStorage.put((int)(shortWord >>> hll.regwidth), registerValue); |
| } |
| } |
| break; |
| case FULL: |
| // NOTE: Iteration is done using m (register count) and NOT |
| // deserializer#totalWordCount() because regwidth may be |
| // less than 8 and as such the padding on the 'last' byte |
| // may be larger than regwidth, causing an extra register |
| // to be read. |
| // SEE: IWordDeserializer#totalWordCount() |
| for(long i=0; i<hll.m; i++) { |
| hll.probabilisticStorage.setRegister(i, deserializer.readWord()); |
| } |
| break; |
| default: |
| throw new RuntimeException("Unsupported HLL type " + type); |
| } |
| |
| return hll; |
| } |
| |
| /** |
| * Create a deep copy of this HLL. |
| * |
| * @see java.lang.Object#clone() |
| */ |
| @Override |
| public HLL clone() throws CloneNotSupportedException { |
| // NOTE: Since the package-only constructor assumes both explicit and |
| // sparse are enabled, the easiest thing to do here is to re-derive |
| // the expthresh parameter and create a new HLL with the public |
| // constructor. |
| // TODO: add a more sensible constructor to make this less obfuscated |
| final int copyExpthresh; |
| if(explicitAuto) { |
| copyExpthresh = -1; |
| } else if(explicitOff) { |
| copyExpthresh = 0; |
| } else { |
| // explicitThreshold is defined as: |
| // |
| // this.explicitThreshold = (1 << (expthresh - 1)); |
| // |
// Since explicitThreshold is a power of two (a single set bit),
// the position of its least-significant set bit is exactly
// (expthresh - 1), so adding one inverts the shift above.
copyExpthresh = BitUtil.leastSignificantBit(explicitThreshold) + 1;
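// e.g. explicitThreshold = 512 = 2^9 -> lsb position 9 -> copyExpthresh = 10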
| } |
| final HLL copy = new HLL(log2m, regwidth, copyExpthresh, !sparseOff/*sparseOn*/, type); |
| switch(type) { |
| case EMPTY: |
| // nothing to be done |
| break; |
| case EXPLICIT: |
| copy.explicitStorage = this.explicitStorage.clone(); |
| break; |
| case SPARSE: |
| copy.sparseProbabilisticStorage = this.sparseProbabilisticStorage.clone(); |
| break; |
| case FULL: |
| copy.probabilisticStorage = this.probabilisticStorage.clone(); |
| break; |
| default: |
| throw new RuntimeException("Unsupported HLL type " + type); |
| } |
| return copy; |
| } |
| } |