blob: 760b97e7ccec77d244d8617335ee4ea47a1789a9 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.datasketches.hll;
import static java.nio.charset.StandardCharsets.UTF_8;
import static org.apache.datasketches.Util.DEFAULT_UPDATE_SEED;
import static org.apache.datasketches.hash.MurmurHash3.hash;
import static org.apache.datasketches.hll.HllUtil.KEY_BITS_26;
import static org.apache.datasketches.hll.HllUtil.KEY_MASK_26;
import org.apache.datasketches.memory.Memory;
/**
* Although this class is package-private, it provides a single place to define and document
* the common public API for both HllSketch and Union.
* @author Lee Rhodes
* @author Kevin Lang
*/
abstract class BaseHllSketch {
/**
* Gets the size in bytes of the current sketch when serialized using
* <i>toCompactByteArray()</i>.
* @return the size in bytes of the current sketch when serialized using
* <i>toCompactByteArray()</i>.
*/
public abstract int getCompactSerializationBytes();
/**
* This is less accurate than the {@link #getEstimate()} method and is automatically used
* when the sketch has gone through union operations where the more accurate HIP estimator
* cannot be used.
* This is made public only for error characterization software that exists in separate
* packages and is not intended for normal use.
* @return the composite estimate
*/
public abstract double getCompositeEstimate();
/**
* Returns the current mode of the sketch: LIST, SET, HLL
* @return the current mode of the sketch: LIST, SET, HLL
*/
abstract CurMode getCurMode();
/**
* Return the cardinality estimate
* @return the cardinality estimate
*/
public abstract double getEstimate();
/**
* Gets the {@link TgtHllType}
* @return the TgtHllType enum value
*/
public abstract TgtHllType getTgtHllType();
/**
* Gets the <i>lgConfigK</i>.
* @return the <i>lgConfigK</i>.
*/
public abstract int getLgConfigK();
/**
* Gets the approximate lower error bound given the specified number of Standard Deviations.
*
* @param numStdDev This must be an integer between 1 and 3, inclusive.
* <a href="{@docRoot}/resources/dictionary.html#numStdDev">See Number of Standard Deviations</a>
* @return the lower bound.
*/
public abstract double getLowerBound(int numStdDev);
/**
* Returns the current serialization version.
* @return the current serialization version.
*/
public static final int getSerializationVersion() {
return PreambleUtil.SER_VER;
}
/**
* Returns the current serialization version of the given Memory.
* @param mem the given Memory containing a serialized HllSketch image.
* @return the current serialization version.
*/
public static final int getSerializationVersion(final Memory mem) {
return mem.getByte(PreambleUtil.SER_VER_BYTE) & 0XFF;
}
/**
* Gets the current (approximate) Relative Error (RE) asymptotic values given several
* parameters. This is used primarily for testing.
* @param upperBound return the RE for the Upper Bound, otherwise for the Lower Bound.
* @param unioned set true if the sketch is the result of a union operation.
* @param lgConfigK the configured value for the sketch.
* @param numStdDev the given number of Standard Deviations. This must be an integer between
* 1 and 3, inclusive.
* <a href="{@docRoot}/resources/dictionary.html#numStdDev">Number of Standard Deviations</a>
* @return the current (approximate) RelativeError
*/
public double getRelErr(final boolean upperBound, final boolean unioned,
final int lgConfigK, final int numStdDev) {
return RelativeErrorTables.getRelErr(upperBound, unioned, lgConfigK, numStdDev);
}
/**
* Gets the size in bytes of the current sketch when serialized using
* <i>toUpdatableByteArray()</i>.
* @return the size in bytes of the current sketch when serialized using
* <i>toUpdatableByteArray()</i>.
*/
public abstract int getUpdatableSerializationBytes();
/**
* Gets the approximate upper error bound given the specified number of Standard Deviations.
*
* @param numStdDev This must be an integer between 1 and 3, inclusive.
* <a href="{@docRoot}/resources/dictionary.html#numStdDev">Number of Standard Deviations</a>
* @return the upper bound.
*/
public abstract double getUpperBound(int numStdDev);
/**
* Returns true if empty
* @return true if empty
*/
public abstract boolean isEmpty();
/**
* Returns true if the backing memory of this sketch is in compact form.
* @return true if the backing memory of this sketch is in compact form.
*/
public abstract boolean isCompact();
/**
* This HLL family of sketches and operators is always estimating, even for very small values.
* @return true
*/
public boolean isEstimationMode() {
return true;
}
/**
* Returns true if this sketch was created using Memory.
* @return true if this sketch was created using Memory.
*/
public abstract boolean isMemory();
/**
* Returns true if the backing memory for this sketch is off-heap.
* @return true if the backing memory for this sketch is off-heap.
*/
public abstract boolean isOffHeap();
/**
* Gets the Out-of-order flag.
* @return true if the current estimator is the non-HIP estimator, which is slightly less
* accurate than the HIP estimator.
*/
abstract boolean isOutOfOrderFlag();
/**
* Returns true if the given Memory refers to the same underlying resource as this sketch.
* The capacities must be the same. If <i>this</i> is a region,
* the region offset must also be the same.
*
* <p>This is only relevant for HLL_4 sketches that have been configured for off-heap
* using WritableMemory or Memory. For on-heap sketches or unions this will return false.
*
* <p>It is rare, but possible, the the off-heap memory that has been allocated to an HLL_4
* sketch may not be large enough. If this should happen, the sketch makes a request for more
* memory from the owner of the resource and then moves itself to this new location. This all
* happens transparently to the user. This method provides a means for the user to
* inquire of the sketch if it has, in fact, moved itself.
*
* @param mem the given Memory
* @return true if the given Memory refers to the same underlying resource as this sketch or
* union.
*/
public abstract boolean isSameResource(Memory mem);
/**
* Resets to empty, but does not change the configured values of lgConfigK and tgtHllType.
*/
public abstract void reset();
/**
* Serializes this sketch as a byte array in compact form. The compact form is smaller in size
* than the updatable form and read-only. It can be used in union operations as follows:
* <pre>
* Union union; HllSketch sk, sk2;
* int lgK = 12;
* sk = new HllSketch(lgK, TgtHllType.HLL_4); //can be 4, 6, or 8
* for (int i = 0; i < (2 << lgK); i++) { sk.update(i); }
* byte[] arr = HllSketch.toCompactByteArray();
* //...
* union = Union.heapify(arr); //initializes the union using data from the array.
* //OR, if used in an off-heap environment:
* union = Union.heapify(Memory.wrap(arr)); //same as above, except from Memory object.
*
* //To recover an updatable heap sketch:
* sk2 = HllSketch.heapify(arr);
* //OR, if used in an off-heap environment:
* sk2 = HllSketch.heapify(Memory.wrap(arr));
* </pre>
*
* <p>The sketch "wrapping" operation skips actual deserialization thus is quite fast. However,
* any attempt to update the derived HllSketch will result in a Read-only exception.
* @return this sketch as a compact byte array.
*/
public abstract byte[] toCompactByteArray();
/**
* Serializes this sketch as a byte array in an updatable form. The updatable form is larger than
* the compact form. The use of this form is primarily in environments that support updating
* sketches in off-heap memory. If the sketch is constructed using HLL_8, sketch updating and
* union updating operations can actually occur in WritableMemory, which can be off-heap:
* <pre>
* Union union; HllSketch sk;
* int lgK = 12;
* sk = new HllSketch(lgK, TgtHllType.HLL_8) //must be 8
* for (int i = 0; i < (2 << lgK); i++) { sk.update(i); }
* byte[] arr = sk.toUpdatableByteArray();
* WritableMemory wmem = WritableMemory.wrap(arr);
* //...
* union = Union.writableWrap(wmem); //no deserialization!
* </pre>
* @return this sketch as an updatable byte array.
*/
public abstract byte[] toUpdatableByteArray();
/**
* Human readable summary as a string.
* @return Human readable summary as a string.
*/
@Override
public String toString() {
return toString(true, false, false, false);
}
/**
* Human readable summary with optional detail. Does not list empty entries.
* @param summary if true, output the sketch summary
* @param detail if true, output the internal data array
* @param auxDetail if true, output the internal Aux array, if it exists.
* @return human readable string with optional detail.
*/
public String toString(final boolean summary, final boolean detail, final boolean auxDetail) {
return toString(summary, detail, auxDetail, false);
}
/**
* Human readable summary with optional detail
* @param summary if true, output the sketch summary
* @param detail if true, output the internal data array
* @param auxDetail if true, output the internal Aux array, if it exists.
* @param all if true, outputs all entries including empty ones
* @return human readable string with optional detail.
*/
public abstract String toString(boolean summary, boolean detail, boolean auxDetail,
boolean all);
/**
* Present the given long as a potential unique item.
*
* @param datum The given long datum.
*/
public void update(final long datum) {
final long[] data = { datum };
couponUpdate(coupon(hash(data, DEFAULT_UPDATE_SEED)));
}
/**
* Present the given double (or float) datum as a potential unique item.
* The double will be converted to a long using Double.doubleToLongBits(datum),
* which normalizes all NaN values to a single NaN representation.
* Plus and minus zero will be normalized to plus zero.
* The special floating-point values NaN and +/- Infinity are treated as distinct.
*
* @param datum The given double datum.
*/
public void update(final double datum) {
final double d = (datum == 0.0) ? 0.0 : datum; // canonicalize -0.0, 0.0
final long[] data = { Double.doubleToLongBits(d) };// canonicalize all NaN forms
couponUpdate(coupon(hash(data, DEFAULT_UPDATE_SEED)));
}
/**
* Present the given String as a potential unique item.
* The string is converted to a byte array using UTF8 encoding.
* If the string is null or empty no update attempt is made and the method returns.
*
* <p>Note: About 2X faster performance can be obtained by first converting the String to a
* char[] and updating the sketch with that. This bypasses the complexity of the Java UTF_8
* encoding. This, of course, will not produce the same internal hash values as updating directly
* with a String. So be consistent! Unioning two sketches, one fed with strings and the other
* fed with char[] will be meaningless.
* </p>
*
* @param datum The given String.
*/
public void update(final String datum) {
if ((datum == null) || datum.isEmpty()) { return; }
final byte[] data = datum.getBytes(UTF_8);
couponUpdate(coupon(hash(data, DEFAULT_UPDATE_SEED)));
}
/**
* Present the given byte array as a potential unique item.
* If the byte array is null or empty no update attempt is made and the method returns.
*
* @param data The given byte array.
*/
public void update(final byte[] data) {
if ((data == null) || (data.length == 0)) { return; }
couponUpdate(coupon(hash(data, DEFAULT_UPDATE_SEED)));
}
/**
* Present the given char array as a potential unique item.
* If the char array is null or empty no update attempt is made and the method returns.
*
* <p>Note: this will not produce the same output hash values as the {@link #update(String)}
* method but will be a little faster as it avoids the complexity of the UTF8 encoding.</p>
*
* @param data The given char array.
*/
public void update(final char[] data) {
if ((data == null) || (data.length == 0)) { return; }
couponUpdate(coupon(hash(data, DEFAULT_UPDATE_SEED)));
}
/**
* Present the given integer array as a potential unique item.
* If the integer array is null or empty no update attempt is made and the method returns.
*
* @param data The given int array.
*/
public void update(final int[] data) {
if ((data == null) || (data.length == 0)) { return; }
couponUpdate(coupon(hash(data, DEFAULT_UPDATE_SEED)));
}
/**
* Present the given long array as a potential unique item.
* If the long array is null or empty no update attempt is made and the method returns.
*
* @param data The given long array.
*/
public void update(final long[] data) {
if ((data == null) || (data.length == 0)) { return; }
couponUpdate(coupon(hash(data, DEFAULT_UPDATE_SEED)));
}
private static final int coupon(final long[] hash) {
final int addr26 = (int) ((hash[0] & KEY_MASK_26));
final int lz = Long.numberOfLeadingZeros(hash[1]);
final int value = ((lz > 62 ? 62 : lz) + 1);
return (value << KEY_BITS_26) | addr26;
}
abstract void couponUpdate(int coupon);
}