/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.druid.hll;
import com.google.common.collect.Collections2;
import com.google.common.collect.Lists;
import com.google.common.hash.HashFunction;
import com.google.common.hash.Hasher;
import com.google.common.hash.Hashing;
import org.apache.druid.java.util.common.StringUtils;
import org.apache.druid.java.util.common.logger.Logger;
import org.junit.Assert;
import org.junit.Ignore;
import org.junit.Test;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.security.MessageDigest;
import java.util.Arrays;
import java.util.Collection;
import java.util.List;
import java.util.Random;
import java.util.concurrent.ThreadLocalRandom;
import java.util.function.Predicate;
/**
* Tests for {@link HyperLogLogCollector}: folding/merging, serialization to and from {@link ByteBuffer}s,
* register-offset handling, and cardinality estimation accuracy.
*/
public class HyperLogLogCollectorTest
{
private static final Logger log = new Logger(HyperLogLogCollectorTest.class);
private final HashFunction fn = Hashing.murmur3_128();
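/**
* Adds the value {@code offset} to all 2048 buckets for each offset from {@code startOffset} through
* {@code endOffset}, driving every register up and forcing the collector's registerOffset to shift.
*/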
private static void fillBuckets(HyperLogLogCollector collector, byte startOffset, byte endOffset)
{
byte offset = startOffset;
while (offset <= endOffset) {
// fill buckets to shift registerOffset
for (short bucket = 0; bucket < 2048; ++bucket) {
collector.add(bucket, offset);
}
offset++;
}
}
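/**
* Splits a stream of hashed values into two halves, folds each half into a fresh collector, and verifies
* the result matches (in equality and estimated cardinality) a collector that saw every value directly.
*/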
@Test
public void testFolding()
{
final Random random = new Random(0);
final int[] numValsToCheck = {10, 20, 50, 100, 1000, 2000};
for (int numThings : numValsToCheck) {
HyperLogLogCollector allCombined = HyperLogLogCollector.makeLatestCollector();
HyperLogLogCollector oneHalf = HyperLogLogCollector.makeLatestCollector();
HyperLogLogCollector otherHalf = HyperLogLogCollector.makeLatestCollector();
for (int i = 0; i < numThings; ++i) {
byte[] hashedVal = fn.hashLong(random.nextLong()).asBytes();
allCombined.add(hashedVal);
if (i % 2 == 0) {
oneHalf.add(hashedVal);
} else {
otherHalf.add(hashedVal);
}
}
HyperLogLogCollector folded = HyperLogLogCollector.makeLatestCollector();
folded.fold(oneHalf);
Assert.assertEquals(oneHalf, folded);
Assert.assertEquals(oneHalf.estimateCardinality(), folded.estimateCardinality(), 0.0d);
folded.fold(otherHalf);
Assert.assertEquals(allCombined, folded);
Assert.assertEquals(allCombined.estimateCardinality(), folded.estimateCardinality(), 0.0d);
}
}
/**
* This is a very long-running test, disabled by default.
* It is meant to catch issues when combining a large number of HLL objects.
*
* It compares adding all the values to one HLL vs.
* splitting up values into HLLs of 100 values each, and folding those HLLs into a single main HLL.
*
* When reaching very large cardinalities (>> 50,000,000), offsets are mismatched between the main HLL and the ones
* with 100 values, requiring a floating max as described in
* https://druid.apache.org/blog/2014/02/18/hyperloglog-optimizations-for-real-world-systems.html
*/
@Ignore
@Test
public void testHighCardinalityRollingFold() throws Exception
{
final HyperLogLogCollector rolling = HyperLogLogCollector.makeLatestCollector();
final HyperLogLogCollector simple = HyperLogLogCollector.makeLatestCollector();
MessageDigest md = MessageDigest.getInstance("SHA-1");
HyperLogLogCollector tmp = HyperLogLogCollector.makeLatestCollector();
int count;
for (count = 0; count < 100_000_000; ++count) {
md.update(StringUtils.toUtf8(Integer.toString(count)));
byte[] hashed = fn.hashBytes(md.digest()).asBytes();
tmp.add(hashed);
simple.add(hashed);
if (count % 100 == 0) {
rolling.fold(tmp);
tmp = HyperLogLogCollector.makeLatestCollector();
}
}
int n = count;
log.info("True cardinality " + n);
log.info("Rolling buffer cardinality " + rolling.estimateCardinality());
log.info("Simple buffer cardinality " + simple.estimateCardinality());
log.info("Rolling cardinality estimate off by %4.1f%%", 100 * (1 - rolling.estimateCardinality() / n));
Assert.assertEquals(n, simple.estimateCardinality(), n * 0.05);
Assert.assertEquals(n, rolling.estimateCardinality(), n * 0.05);
}
@Ignore
@Test
public void testHighCardinalityRollingFold2()
{
final HyperLogLogCollector rolling = HyperLogLogCollector.makeLatestCollector();
int count;
long start = System.currentTimeMillis();
for (count = 0; count < 50_000_000; ++count) {
HyperLogLogCollector theCollector = HyperLogLogCollector.makeLatestCollector();
theCollector.add(fn.hashLong(count).asBytes());
rolling.fold(theCollector);
}
log.info("testHighCardinalityRollingFold2 took %d ms", System.currentTimeMillis() - start);
int n = count;
log.info("True cardinality " + n);
log.info("Rolling buffer cardinality " + rolling.estimateCardinality());
log.info("Rolling cardinality estimate off by %4.1f%%", 100 * (1 - rolling.estimateCardinality() / n));
Assert.assertEquals(n, rolling.estimateCardinality(), n * 0.05);
}
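/**
* Same check as {@link #testFolding()}, but folds the half-collectors via their serialized
* {@link ByteBuffer} form ({@code toByteBuffer()}) rather than the collector objects themselves.
*/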
@Test
public void testFoldingByteBuffers()
{
final Random random = new Random(0);
final int[] numValsToCheck = {10, 20, 50, 100, 1000, 2000};
for (int numThings : numValsToCheck) {
HyperLogLogCollector allCombined = HyperLogLogCollector.makeLatestCollector();
HyperLogLogCollector oneHalf = HyperLogLogCollector.makeLatestCollector();
HyperLogLogCollector otherHalf = HyperLogLogCollector.makeLatestCollector();
for (int i = 0; i < numThings; ++i) {
byte[] hashedVal = fn.hashLong(random.nextLong()).asBytes();
allCombined.add(hashedVal);
if (i % 2 == 0) {
oneHalf.add(hashedVal);
} else {
otherHalf.add(hashedVal);
}
}
HyperLogLogCollector folded = HyperLogLogCollector.makeLatestCollector();
folded.fold(oneHalf.toByteBuffer());
Assert.assertEquals(oneHalf, folded);
Assert.assertEquals(oneHalf.estimateCardinality(), folded.estimateCardinality(), 0.0d);
folded.fold(otherHalf.toByteBuffer());
Assert.assertEquals(allCombined, folded);
Assert.assertEquals(allCombined.estimateCardinality(), folded.estimateCardinality(), 0.0d);
}
}
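/**
* Same check as {@link #testFoldingByteBuffers()}, but the destination collector is created over a
* read-only ByteBuffer, exercising the case where the initial backing storage is not writable.
*/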
@Test
public void testFoldingReadOnlyByteBuffers()
{
final Random random = new Random(0);
final int[] numValsToCheck = {10, 20, 50, 100, 1000, 2000};
for (int numThings : numValsToCheck) {
HyperLogLogCollector allCombined = HyperLogLogCollector.makeLatestCollector();
HyperLogLogCollector oneHalf = HyperLogLogCollector.makeLatestCollector();
HyperLogLogCollector otherHalf = HyperLogLogCollector.makeLatestCollector();
for (int i = 0; i < numThings; ++i) {
byte[] hashedVal = fn.hashLong(random.nextLong()).asBytes();
allCombined.add(hashedVal);
if (i % 2 == 0) {
oneHalf.add(hashedVal);
} else {
otherHalf.add(hashedVal);
}
}
HyperLogLogCollector folded = HyperLogLogCollector.makeCollector(
ByteBuffer.wrap(HyperLogLogCollector.makeEmptyVersionedByteArray())
.asReadOnlyBuffer()
);
folded.fold(oneHalf.toByteBuffer());
Assert.assertEquals(oneHalf, folded);
Assert.assertEquals(oneHalf.estimateCardinality(), folded.estimateCardinality(), 0.0d);
folded.fold(otherHalf.toByteBuffer());
Assert.assertEquals(allCombined, folded);
Assert.assertEquals(allCombined.estimateCardinality(), folded.estimateCardinality(), 0.0d);
}
}
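/**
* Same check as {@link #testFoldingReadOnlyByteBuffers()}, with the read-only backing buffer
* additionally positioned 17 bytes into a larger buffer (see {@link #shiftedBuffer}).
*/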
@Test
public void testFoldingReadOnlyByteBuffersWithArbitraryPosition()
{
final Random random = new Random(0);
final int[] numValsToCheck = {10, 20, 50, 100, 1000, 2000};
for (int numThings : numValsToCheck) {
HyperLogLogCollector allCombined = HyperLogLogCollector.makeLatestCollector();
HyperLogLogCollector oneHalf = HyperLogLogCollector.makeLatestCollector();
HyperLogLogCollector otherHalf = HyperLogLogCollector.makeLatestCollector();
for (int i = 0; i < numThings; ++i) {
byte[] hashedVal = fn.hashLong(random.nextLong()).asBytes();
allCombined.add(hashedVal);
if (i % 2 == 0) {
oneHalf.add(hashedVal);
} else {
otherHalf.add(hashedVal);
}
}
HyperLogLogCollector folded = HyperLogLogCollector.makeCollector(
shiftedBuffer(
ByteBuffer.wrap(HyperLogLogCollector.makeEmptyVersionedByteArray())
.asReadOnlyBuffer(),
17
)
);
folded.fold(oneHalf.toByteBuffer());
Assert.assertEquals(oneHalf, folded);
Assert.assertEquals(oneHalf.estimateCardinality(), folded.estimateCardinality(), 0.0d);
folded.fold(otherHalf.toByteBuffer());
Assert.assertEquals(allCombined, folded);
Assert.assertEquals(allCombined.estimateCardinality(), folded.estimateCardinality(), 0.0d);
}
}
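/**
* Folds two hand-built dense buffers with register offsets 1 and 0, in both orders, and asserts the same
* serialized result each time. The byte-by-byte assertions follow the dense header layout written by
* {@code makeCollectorBuffer}: version, register offset, number of non-zero registers, the max-overflow
* value and register, then the packed register bytes.
*/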
@Test
public void testFoldWithDifferentOffsets1()
{
ByteBuffer biggerOffset = makeCollectorBuffer(1, (byte) 0x00, 0x11);
ByteBuffer smallerOffset = makeCollectorBuffer(0, (byte) 0x20, 0x00);
HyperLogLogCollector collector = HyperLogLogCollector.makeLatestCollector();
collector.fold(biggerOffset);
collector.fold(smallerOffset);
ByteBuffer outBuffer = collector.toByteBuffer();
Assert.assertEquals(outBuffer.get(), collector.getVersion());
Assert.assertEquals(outBuffer.get(), 1);
Assert.assertEquals(outBuffer.getShort(), 2047);
outBuffer.get();
outBuffer.getShort();
Assert.assertEquals(outBuffer.get(), 0x10);
while (outBuffer.hasRemaining()) {
Assert.assertEquals(outBuffer.get(), 0x11);
}
collector = HyperLogLogCollector.makeLatestCollector();
collector.fold(smallerOffset);
collector.fold(biggerOffset);
outBuffer = collector.toByteBuffer();
Assert.assertEquals(outBuffer.get(), collector.getVersion());
Assert.assertEquals(outBuffer.get(), 1);
Assert.assertEquals(outBuffer.getShort(), 2047);
Assert.assertEquals(outBuffer.get(), 0);
Assert.assertEquals(outBuffer.getShort(), 0);
Assert.assertEquals(outBuffer.get(), 0x10);
while (outBuffer.hasRemaining()) {
Assert.assertEquals(outBuffer.get(), 0x11);
}
}
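/**
* Creates a collector over a duplicate of a caller-owned buffer and checks that folds are written through
* to the original buffer, even though folding with differing offsets may swap buffers internally.
*/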
@Test
public void testBufferSwap()
{
ByteBuffer biggerOffset = makeCollectorBuffer(1, (byte) 0x00, 0x11);
ByteBuffer smallerOffset = makeCollectorBuffer(0, (byte) 0x20, 0x00);
ByteBuffer buffer = ByteBuffer.allocate(HyperLogLogCollector.getLatestNumBytesForDenseStorage());
HyperLogLogCollector collector = HyperLogLogCollector.makeCollector(buffer.duplicate());
// make sure the original buffer gets modified
collector.fold(biggerOffset);
Assert.assertEquals(collector, HyperLogLogCollector.makeCollector(buffer.duplicate()));
// make sure the original buffer gets modified
collector.fold(smallerOffset);
Assert.assertEquals(collector, HyperLogLogCollector.makeCollector(buffer.duplicate()));
}
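/**
* Same as {@link #testFoldWithDifferentOffsets1()}, but the input buffers start at non-zero positions
* (shifted by 10 and 15 bytes), verifying that fold() reads from each buffer's current position.
*/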
@Test
public void testFoldWithArbitraryInitialPositions()
{
ByteBuffer biggerOffset = shiftedBuffer(makeCollectorBuffer(1, (byte) 0x00, 0x11), 10);
ByteBuffer smallerOffset = shiftedBuffer(makeCollectorBuffer(0, (byte) 0x20, 0x00), 15);
HyperLogLogCollector collector = HyperLogLogCollector.makeLatestCollector();
collector.fold(biggerOffset);
collector.fold(smallerOffset);
ByteBuffer outBuffer = collector.toByteBuffer();
Assert.assertEquals(outBuffer.get(), collector.getVersion());
Assert.assertEquals(outBuffer.get(), 1);
Assert.assertEquals(outBuffer.getShort(), 2047);
outBuffer.get();
outBuffer.getShort();
Assert.assertEquals(outBuffer.get(), 0x10);
while (outBuffer.hasRemaining()) {
Assert.assertEquals(outBuffer.get(), 0x11);
}
collector = HyperLogLogCollector.makeLatestCollector();
collector.fold(smallerOffset);
collector.fold(biggerOffset);
outBuffer = collector.toByteBuffer();
Assert.assertEquals(outBuffer.get(), collector.getVersion());
Assert.assertEquals(outBuffer.get(), 1);
Assert.assertEquals(outBuffer.getShort(), 2047);
outBuffer.get();
outBuffer.getShort();
Assert.assertEquals(outBuffer.get(), 0x10);
while (outBuffer.hasRemaining()) {
Assert.assertEquals(outBuffer.get(), 0x11);
}
}
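/**
* Copies {@code buf} into a larger buffer starting at {@code offset} and returns it with its position set
* to {@code offset}, simulating a collector stored somewhere other than position 0.
*/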
protected ByteBuffer shiftedBuffer(ByteBuffer buf, int offset)
{
ByteBuffer shifted = ByteBuffer.allocate(buf.remaining() + offset);
shifted.position(offset);
shifted.put(buf);
shifted.position(offset);
return shifted;
}
@Test
public void testFoldWithDifferentOffsets2()
{
ByteBuffer biggerOffset = makeCollectorBuffer(1, (byte) 0x01, 0x11);
ByteBuffer smallerOffset = makeCollectorBuffer(0, (byte) 0x20, 0x00);
HyperLogLogCollector collector = HyperLogLogCollector.makeLatestCollector();
collector.fold(biggerOffset);
collector.fold(smallerOffset);
ByteBuffer outBuffer = collector.toByteBuffer();
Assert.assertEquals(outBuffer.get(), collector.getVersion());
Assert.assertEquals(outBuffer.get(), 2);
Assert.assertEquals(outBuffer.getShort(), 0);
outBuffer.get();
outBuffer.getShort();
Assert.assertFalse(outBuffer.hasRemaining());
collector = HyperLogLogCollector.makeLatestCollector();
collector.fold(smallerOffset);
collector.fold(biggerOffset);
outBuffer = collector.toByteBuffer();
Assert.assertEquals(outBuffer.get(), collector.getVersion());
Assert.assertEquals(outBuffer.get(), 2);
Assert.assertEquals(outBuffer.getShort(), 0);
outBuffer.get();
outBuffer.getShort();
Assert.assertFalse(outBuffer.hasRemaining());
}
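/**
* Hand-builds two dense buffers full of 0x11 registers, the first of which leaves a single upper nibble at
* zero; folding the second into the first fills that last register and should trigger an offset change,
* producing a header-only result with register offset 1 and zero non-zero registers.
*/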
@Test
public void testFoldWithUpperNibbleTriggersOffsetChange()
{
byte[] arr1 = new byte[HyperLogLogCollector.getLatestNumBytesForDenseStorage()];
Arrays.fill(arr1, (byte) 0x11);
ByteBuffer buffer1 = ByteBuffer.wrap(arr1);
buffer1.put(0, VersionOneHyperLogLogCollector.VERSION);
buffer1.put(1, (byte) 0);
buffer1.putShort(2, (short) (2047));
buffer1.put(VersionOneHyperLogLogCollector.HEADER_NUM_BYTES, (byte) 0x1);
byte[] arr2 = new byte[HyperLogLogCollector.getLatestNumBytesForDenseStorage()];
Arrays.fill(arr2, (byte) 0x11);
ByteBuffer buffer2 = ByteBuffer.wrap(arr2);
buffer2.put(0, VersionOneHyperLogLogCollector.VERSION);
buffer2.put(1, (byte) 0);
buffer2.putShort(2, (short) (2048));
HyperLogLogCollector collector = HyperLogLogCollector.makeCollector(buffer1);
collector.fold(buffer2);
ByteBuffer outBuffer = collector.toByteBuffer();
Assert.assertEquals(outBuffer.get(), VersionOneHyperLogLogCollector.VERSION);
Assert.assertEquals(outBuffer.get(), 1);
Assert.assertEquals(outBuffer.getShort(), 0);
outBuffer.get();
outBuffer.getShort();
Assert.assertFalse(outBuffer.hasRemaining());
}
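/**
* Folds a sparsely-serialized collector (mostly zero registers, round-tripped through toByteBuffer()) with
* a dense buffer at a higher register offset, in both orders, asserting the same header-only result
* (offset 2, zero non-zero registers) each time.
*/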
@Test
public void testSparseFoldWithDifferentOffsets1()
{
ByteBuffer biggerOffset = makeCollectorBuffer(1, new byte[]{0x11, 0x10}, 0x11);
ByteBuffer sparse = HyperLogLogCollector.makeCollector(makeCollectorBuffer(0, new byte[]{0x00, 0x02}, 0x00))
.toByteBuffer();
HyperLogLogCollector collector = HyperLogLogCollector.makeLatestCollector();
collector.fold(biggerOffset);
collector.fold(sparse);
ByteBuffer outBuffer = collector.toByteBuffer();
Assert.assertEquals(outBuffer.get(), collector.getVersion());
Assert.assertEquals(outBuffer.get(), 2);
Assert.assertEquals(outBuffer.getShort(), 0);
Assert.assertEquals(outBuffer.get(), 0);
Assert.assertEquals(outBuffer.getShort(), 0);
Assert.assertFalse(outBuffer.hasRemaining());
collector = HyperLogLogCollector.makeLatestCollector();
collector.fold(sparse);
collector.fold(biggerOffset);
outBuffer = collector.toByteBuffer();
Assert.assertEquals(outBuffer.get(), collector.getVersion());
Assert.assertEquals(outBuffer.get(), 2);
Assert.assertEquals(outBuffer.getShort(), 0);
Assert.assertEquals(outBuffer.get(), 0);
Assert.assertEquals(outBuffer.getShort(), 0);
Assert.assertFalse(outBuffer.hasRemaining());
}
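/**
* Hand-builds a version-1 dense collector buffer: the header (version, the given register offset, the
* computed number of non-zero registers, zeroed max-overflow fields), followed by {@code initialBytes} and
* then {@code remainingBytes} repeated until the register space is full. Returned as a read-only buffer.
*/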
private ByteBuffer makeCollectorBuffer(int offset, byte initialBytes, int remainingBytes)
{
return makeCollectorBuffer(offset, new byte[]{initialBytes}, remainingBytes);
}
private ByteBuffer makeCollectorBuffer(int offset, byte[] initialBytes, int remainingBytes)
{
short numNonZero = 0;
for (byte initialByte : initialBytes) {
numNonZero += computeNumNonZero(initialByte);
}
final short numNonZeroInRemaining = computeNumNonZero((byte) remainingBytes);
numNonZero += (short) ((HyperLogLogCollector.NUM_BYTES_FOR_BUCKETS - initialBytes.length) * numNonZeroInRemaining);
ByteBuffer biggerOffset = ByteBuffer.allocate(HyperLogLogCollector.getLatestNumBytesForDenseStorage());
biggerOffset.put(VersionOneHyperLogLogCollector.VERSION);
biggerOffset.put((byte) offset);
biggerOffset.putShort(numNonZero);
biggerOffset.put((byte) 0);
biggerOffset.putShort((short) 0);
biggerOffset.put(initialBytes);
while (biggerOffset.hasRemaining()) {
biggerOffset.put((byte) remainingBytes);
}
biggerOffset.clear();
return biggerOffset.asReadOnlyBuffer();
}
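/**
* Counts how many of the two 4-bit registers packed into {@code theByte} are non-zero (0, 1, or 2).
*/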
private short computeNumNonZero(byte theByte)
{
short retVal = 0;
if ((theByte & 0x0f) > 0) {
++retVal;
}
if ((theByte & 0xf0) > 0) {
++retVal;
}
return retVal;
}
@Ignore
@Test // This test can help find potentially problematic combinations, but it's non-deterministic
public void testFoldingwithDifferentOffsets()
{
// final Random random = new Random(37); // this seed will cause this test to fail because of slightly larger errors
final Random random = new Random(0);
for (int j = 0; j < 10; j++) {
HyperLogLogCollector smallVals = HyperLogLogCollector.makeLatestCollector();
HyperLogLogCollector bigVals = HyperLogLogCollector.makeLatestCollector();
HyperLogLogCollector all = HyperLogLogCollector.makeLatestCollector();
int numThings = 500000;
for (int i = 0; i < numThings; i++) {
byte[] hashedVal = fn.hashLong(random.nextLong()).asBytes();
if (i < 1000) {
smallVals.add(hashedVal);
} else {
bigVals.add(hashedVal);
}
all.add(hashedVal);
}
HyperLogLogCollector folded = HyperLogLogCollector.makeLatestCollector();
folded.fold(smallVals);
folded.fold(bigVals);
final double expected = all.estimateCardinality();
Assert.assertEquals(expected, folded.estimateCardinality(), expected * 0.025);
Assert.assertEquals(numThings, folded.estimateCardinality(), numThings * 0.05);
}
}
@Ignore
@Test
public void testFoldingwithDifferentOffsets2() throws Exception
{
final Random random = new Random(0);
MessageDigest md = MessageDigest.getInstance("SHA-1");
for (int j = 0; j < 1; j++) {
HyperLogLogCollector evenVals = HyperLogLogCollector.makeLatestCollector();
HyperLogLogCollector oddVals = HyperLogLogCollector.makeLatestCollector();
HyperLogLogCollector all = HyperLogLogCollector.makeLatestCollector();
int numThings = 500000;
for (int i = 0; i < numThings; i++) {
md.update(StringUtils.toUtf8(Integer.toString(random.nextInt())));
byte[] hashedVal = fn.hashBytes(md.digest()).asBytes();
if (i % 2 == 0) {
evenVals.add(hashedVal);
} else {
oddVals.add(hashedVal);
}
all.add(hashedVal);
}
HyperLogLogCollector folded = HyperLogLogCollector.makeLatestCollector();
folded.fold(evenVals);
folded.fold(oddVals);
final double expected = all.estimateCardinality();
Assert.assertEquals(expected, folded.estimateCardinality(), expected * 0.025);
Assert.assertEquals(numThings, folded.estimateCardinality(), numThings * 0.05);
}
}
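/**
* Adds up to two million seeded random hashes and compares the estimate against pre-computed expected
* values at each checkpoint in {@code valsToCheck}; with a fixed seed and hash function the estimates are
* deterministic, so the comparison delta is 0.
*/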
@Test
public void testEstimation()
{
Random random = new Random(0L);
final int[] valsToCheck = {10, 20, 50, 100, 1000, 2000, 5000, 10000, 20000, 50000, 100000, 1000000, 2000000};
final double[] expectedVals = {
11.029647221949576, 21.108407720752034, 51.64575281885815, 100.42231726408892,
981.8579991802412, 1943.1337257462792, 4946.192042635218, 9935.088157579434,
20366.1486889433, 49433.56029693898, 100615.26273314281, 980831.624899156000,
1982408.2608981386
};
int valsToCheckIndex = 0;
HyperLogLogCollector collector = HyperLogLogCollector.makeLatestCollector();
for (int i = 0; i < valsToCheck[valsToCheck.length - 1]; ++i) {
collector.add(fn.hashLong(random.nextLong()).asBytes());
if (i == valsToCheck[valsToCheckIndex]) {
Assert.assertEquals(expectedVals[valsToCheckIndex], collector.estimateCardinality(), 0.0d);
++valsToCheckIndex;
}
}
Assert.assertEquals(expectedVals.length, valsToCheckIndex + 1);
Assert.assertEquals(expectedVals[valsToCheckIndex], collector.estimateCardinality(), 0.0d);
}
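/**
* Same deterministic estimation check as {@link #testEstimation()}, with the collector backed by a direct
* (off-heap) ByteBuffer.
*/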
@Test
public void testEstimationReadOnlyByteBuffers()
{
Random random = new Random(0L);
final int[] valsToCheck = {10, 20, 50, 100, 1000, 2000, 5000, 10000, 20000, 50000, 100000, 1000000, 2000000};
final double[] expectedVals = {
11.029647221949576, 21.108407720752034, 51.64575281885815, 100.42231726408892,
981.8579991802412, 1943.1337257462792, 4946.192042635218, 9935.088157579434,
20366.1486889433, 49433.56029693898, 100615.26273314281, 980831.624899156000,
1982408.2608981386
};
int valsToCheckIndex = 0;
HyperLogLogCollector collector = HyperLogLogCollector.makeCollector(
ByteBuffer.allocateDirect(
HyperLogLogCollector.getLatestNumBytesForDenseStorage()
)
);
for (int i = 0; i < valsToCheck[valsToCheck.length - 1]; ++i) {
collector.add(fn.hashLong(random.nextLong()).asBytes());
if (i == valsToCheck[valsToCheckIndex]) {
Assert.assertEquals(expectedVals[valsToCheckIndex], collector.estimateCardinality(), 0.0d);
++valsToCheckIndex;
}
}
Assert.assertEquals(expectedVals.length, valsToCheckIndex + 1);
Assert.assertEquals(expectedVals[valsToCheckIndex], collector.estimateCardinality(), 0.0d);
}
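/**
* Same deterministic estimation check, with the collector backed by a ByteBuffer whose limit is smaller
* than its capacity, to make sure the collector respects the limit rather than the full allocation.
*/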
@Test
public void testEstimationLimitDifferentFromCapacity()
{
Random random = new Random(0L);
final int[] valsToCheck = {10, 20, 50, 100, 1000, 2000, 5000, 10000, 20000, 50000, 100000, 1000000, 2000000};
final double[] expectedVals = {
11.029647221949576, 21.108407720752034, 51.64575281885815, 100.42231726408892,
981.8579991802412, 1943.1337257462792, 4946.192042635218, 9935.088157579434,
20366.1486889433, 49433.56029693898, 100615.26273314281, 980831.624899156000,
1982408.2608981386
};
int valsToCheckIndex = 0;
HyperLogLogCollector collector = HyperLogLogCollector.makeCollector(
(ByteBuffer) ByteBuffer.allocate(10000)
.position(0)
.limit(HyperLogLogCollector.getLatestNumBytesForDenseStorage())
);
for (int i = 0; i < valsToCheck[valsToCheck.length - 1]; ++i) {
collector.add(fn.hashLong(random.nextLong()).asBytes());
if (i == valsToCheck[valsToCheckIndex]) {
Assert.assertEquals(expectedVals[valsToCheckIndex], collector.estimateCardinality(), 0.0d);
++valsToCheckIndex;
}
}
Assert.assertEquals(expectedVals.length, valsToCheckIndex + 1);
Assert.assertEquals(expectedVals[valsToCheckIndex], collector.estimateCardinality(), 0.0d);
}
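/**
* Checks that {@code HyperLogLogCollector.estimateByteBuffer()} applied to a collector's serialized form
* (sparse, since only 100 values were added) matches estimateCardinality() on the collector itself.
*/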
@Test
public void testSparseEstimation()
{
final Random random = new Random(0);
HyperLogLogCollector collector = HyperLogLogCollector.makeLatestCollector();
for (int i = 0; i < 100; ++i) {
collector.add(fn.hashLong(random.nextLong()).asBytes());
}
Assert.assertEquals(
collector.estimateCardinality(), HyperLogLogCollector.estimateByteBuffer(collector.toByteBuffer()), 0.0d
);
}
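/**
* After raising the register offset to 49 via fillBuckets, adding an all-zero hash (the highest possible
* bit position, 64) still yields a finite but very large estimate; after raising the offset to 63, the
* same addition pushes the estimate to positive infinity.
*/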
@Test
public void testHighBits()
{
HyperLogLogCollector collector = HyperLogLogCollector.makeLatestCollector();
// fill up all the buckets so we reach a registerOffset of 49
fillBuckets(collector, (byte) 0, (byte) 49);
// highest possible bit position is 64
collector.add(new byte[]{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0});
Assert.assertEquals(8.5089685793441677E17, collector.estimateCardinality(), 1000);
// this might happen once in a million years if you hash a billion values a second
fillBuckets(collector, (byte) 0, (byte) 63);
collector.add(new byte[]{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0});
Assert.assertEquals(Double.POSITIVE_INFINITY, collector.estimateCardinality(), 1000);
}
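/**
* Values too large for a 4-bit register land in the single max-overflow (register, value) slot: only the
* largest value is kept, and the register offset and non-zero register count stay untouched.
*/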
@Test
public void testMaxOverflow()
{
HyperLogLogCollector collector = HyperLogLogCollector.makeLatestCollector();
collector.add((short) 23, (byte) 16);
Assert.assertEquals(23, collector.getMaxOverflowRegister());
Assert.assertEquals(16, collector.getMaxOverflowValue());
Assert.assertEquals(0, collector.getRegisterOffset());
Assert.assertEquals(0, collector.getNumNonZeroRegisters());
collector.add((short) 56, (byte) 17);
Assert.assertEquals(56, collector.getMaxOverflowRegister());
Assert.assertEquals(17, collector.getMaxOverflowValue());
collector.add((short) 43, (byte) 16);
Assert.assertEquals(56, collector.getMaxOverflowRegister());
Assert.assertEquals(17, collector.getMaxOverflowValue());
Assert.assertEquals(0, collector.getRegisterOffset());
Assert.assertEquals(0, collector.getNumNonZeroRegisters());
}
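/**
* Fills every bucket except the first with value 1, then adds the last one: all registers roll into the
* offset (offset 1, zero non-zero registers), the serialized form shrinks to a header-only sparse buffer,
* and the estimate collapses to 0. Folding that header-only buffer into a nearly full dense collector must
* still produce a valid dense result.
*/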
@Test
public void testRegisterSwapWithSparse()
{
final HyperLogLogCollector collector = HyperLogLogCollector.makeLatestCollector();
// Skip the first bucket
for (int i = 1; i < HyperLogLogCollector.NUM_BUCKETS; i++) {
collector.add((short) i, (byte) 1);
Assert.assertEquals(i, collector.getNumNonZeroRegisters());
Assert.assertEquals(0, collector.getRegisterOffset());
}
Assert.assertEquals(
15615.219683654448D,
HyperLogLogCollector.makeCollector(collector.toByteBuffer().asReadOnlyBuffer())
.estimateCardinality(),
1e-5D
);
final byte[] hash = new byte[10];
hash[0] = 1; // Bucket 0, 1 offset of 0
collector.add(hash);
Assert.assertEquals(0, collector.getNumNonZeroRegisters());
Assert.assertEquals(1, collector.getRegisterOffset());
// We have a REALLY bad distribution, so a sketch estimate of 0 is fine.
Assert.assertEquals(
0.0D,
HyperLogLogCollector.makeCollector(collector.toByteBuffer().asReadOnlyBuffer())
.estimateCardinality(),
1e-5D
);
final ByteBuffer buffer = collector.toByteBuffer();
Assert.assertEquals(collector.getNumHeaderBytes(), buffer.remaining());
final HyperLogLogCollector denseCollector = HyperLogLogCollector.makeLatestCollector();
for (int i = 0; i < HyperLogLogCollector.NUM_BUCKETS - 1; i++) {
denseCollector.add((short) i, (byte) 1);
}
Assert.assertEquals(HyperLogLogCollector.NUM_BUCKETS - 1, denseCollector.getNumNonZeroRegisters());
final HyperLogLogCollector folded = denseCollector.fold(HyperLogLogCollector.makeCollector(buffer));
Assert.assertNotNull(folded.toByteBuffer());
Assert.assertEquals(folded.getStorageBuffer().remaining(), denseCollector.getNumBytesForDenseStorage());
}
// Example of a terrible sampling filter. Don't use this method
@Test
public void testCanFillUpOnMod()
{
final HashFunction fn = Hashing.murmur3_128();
final HyperLogLogCollector hyperLogLogCollector = HyperLogLogCollector.makeLatestCollector();
final byte[] b = new byte[10];
b[0] = 1;
hyperLogLogCollector.add(b);
final Random random = new Random(347893248701078L);
long loops = 0;
// Do a 1% "sample" where the mod of the hash is 43
final Predicate<Integer> pass = i -> {
// ByteOrder.nativeOrder() on lots of systems is ByteOrder.LITTLE_ENDIAN
final ByteBuffer bb = ByteBuffer.wrap(fn.hashInt(i).asBytes()).order(ByteOrder.LITTLE_ENDIAN);
return (bb.getInt() % 100) == 43;
};
final long loopLimit = 1_000_000_000L;
do {
final int rnd = random.nextInt();
if (!pass.test(rnd)) {
continue;
}
final Hasher hasher = fn.newHasher();
hasher.putInt(rnd);
hyperLogLogCollector.add(hasher.hash().asBytes());
} while (hyperLogLogCollector.getNumNonZeroRegisters() > 0 && ++loops < loopLimit);
Assert.assertNotEquals(loopLimit, loops);
Assert.assertEquals(hyperLogLogCollector.getNumHeaderBytes(), hyperLogLogCollector.toByteBuffer().remaining());
}
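/**
* Folding keeps whichever collector's max-overflow (register, value) pair has the larger value, both when
* neither collector has a register offset and when the two offsets differ.
*/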
@Test
public void testMergeMaxOverflow()
{
// no offset
HyperLogLogCollector collector = HyperLogLogCollector.makeLatestCollector();
collector.add((short) 23, (byte) 16);
HyperLogLogCollector other = HyperLogLogCollector.makeLatestCollector();
other.add((short) 56, (byte) 17);
collector.fold(other);
Assert.assertEquals(56, collector.getMaxOverflowRegister());
Assert.assertEquals(17, collector.getMaxOverflowValue());
// different offsets
// fill up all the buckets so we reach a registerOffset of 49
collector = HyperLogLogCollector.makeLatestCollector();
fillBuckets(collector, (byte) 0, (byte) 49);
collector.add((short) 23, (byte) 65);
other = HyperLogLogCollector.makeLatestCollector();
fillBuckets(other, (byte) 0, (byte) 43);
other.add((short) 47, (byte) 67);
collector.fold(other);
Assert.assertEquals(47, collector.getMaxOverflowRegister());
Assert.assertEquals(67, collector.getMaxOverflowValue());
}
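/**
* Folds four pre-serialized, base64-encoded collectors in every permutation and asserts that the
* max-overflow register/value and the cardinality estimate are identical regardless of fold order.
*/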
@Test
public void testFoldOrder()
{
final List<String> objects = Lists.newArrayList(
"AQcH/xYEMXOjRTVSQ1NXVENEM1RTUlVTRDI1aEVnhkOjNUaCI2MkU2VVhVNkNyVTa4NEYkS0kjZYU1RDdEYzUjglNTUzVFM0NkU3ZFUjOVJCdlU0N2QjRDRUV1MyZjNmVDOUM2RVVFRzhnUzVXY1R1RHUnNziURUdmREM0VjVEQmU0aEInZYNzNZNVRFgzVFNolSJHNIQ3QklEZlNSNoNTJXpDk1dFWjJGNYNiQzQkZFNEYzc1NVhSczM2NmJDZlc3JJRCpVNiRlNEI3dmU1ZGI0Q1RCMhNFZEJDZDYyNFOCM3U0VmRlVlNIRVQ4VVw1djNDVURHVSaFU0VEY0U1JFNIVCYlVEJWM2NWU0eURDOjQ6YyNTYkZjNUVjR1ZDdnVkMzVHZFpjMzlmNEFHM0dHJlRYTHSEQjVZVVZkVVIzIjg2SUU0NSM0VFNDNCdGVlQkhBNENCVTZGZEVlxFQyQ0NYWkUmVUJUYzRlNqg4NVVTNThEJkRGNDNUNFSEYmgkR0dDR1JldCNhVEZGRENGc1NDRUNER3WJRTRHQ4JlOYZoJDVVVVMzZSREZ1Q1UjSHNkdUMlU0ODIzZThSNmNDNjQ1o2I0YiRGYyZkNUJYVEMyN2QpQyMkc2VTE4U2VCNHZFRDNTh0IzI2VFNTMlUkNGMlKTRCIyR3QiQzFUNkRTdDM6RDRFI3VyVlcyWCUlQ0YjNjU2Q2dEVFNTRyRlI7VElHVTVVNGk0JHJTQzQkQyVlV0NCVlRkhWYkQ0RVaDNYdFZHWEWFJEYpM0QjNjNVUzNCVzVkgzZGFzQkRZUzN2U1dUFGVWZTUzVUREZDciZEVVYVNjeCU0ZDdEhzIpU2RTOFRUQkWlk1OFRUVTN1MkZSM3ZFc1VDNnUmc2NKNUaUIzd3M0RWxEZTsiNENLVHU0NFUmQ2RWRFdCNUVENFkxZCEnRLQkNEU0RVNmVDQjl9ZmNkM1QVM0MzQkUjJlVHRkNEVWlENDVUIlUvRkM0RVY1UzY6OGVHVCRDIzRUUlUjM2RDWSVkVIU1U1ZiVFNlNDhTN1VWNTVEZ2RzNzVDQlY0ZUNENUM5NUdkRDJGYzRCUzIjRGR4UmJFI4GDRTUiQ0ZUhVY1ZEYoZSRoVDYnREYkQ1SUU0RWUycjp2RZIySVZkUmZDREZVJGQyVEc1JElBZENEU2VEQlVUUnNDQziLRTNidmNjVCtjRFU2Q0SGYzVHVpGTNoVDxFVSMlWTJFQyRJdV1EI3RDloYyNFQ0c1NVY0ZHVEY0dkM2QkQyVDVUVTNFUyamMUdSNrNz0mlFlERzZTSGhFRjVGM3NWU2NINDI2U1RERUhjY4FHNWNTVTV1U0U2I0VXNEZERWNDNUSjI1WmMmQ4U=",
"AQgH+BUFUEUrZVRjM2IjMzJRESMlUnlTJjEjRhRlNBEyMSUpaGJTMjRCIzMTNCRENRdxNiNEZCQzNERYMiAyIiQmUTI+MhEzV1RWJoMjQjIySDN0QiYDUjUzNjRUVEYyQleDEiUmg0ERRjIjIzJUQjMxNlJGUTNDJFNTRzJiE1M0RjQzUzIiFDUmMjIzJWVCNENTIRJVODUzEkIVMhFEIjM0MkMyIRRCNFNxQyNCQ2UzOFQiJSM0EzU1V1M2EjhUVENDclZzImEiMTJBQlQiJCgyIyKkJSUlNBNDE2M3QSIyMicjMlJEUhJDJFQjJ0VSQ0QyYSFhZSNlQ4REUzVFIlOFRHIkYUJEM8RVMkMiMEczQwMlE1EkAlNiQlhCNkISRVI0ITUjRDU1JVNlK1QyGGRHQVM0NUVHQ1MkMyQoIzMzFCFUI0IhU1OIhCIlZUQVIUMyYzMlMUZ0RCKEIigUIlQ0QkQTM0MkM0QyJkUSM2I2tHJDUTQ0RBQ0YyNlUxUzIiIiMUiSMzUlJDNDQjM0ITQyNIM1MyNWM0MDOTZYVDRWIiZhMzc0NCJ0Q0NDZEMUElMyRyMmUhNiMkIZNjMkEyRTIzYkMzNUODUTNDJVM0ZTQjFCJCNWSTUlEiNCM1U2FCZUJzMVMyLjNkMhITVDEjIYMzNiVmIlO1VTMjMiVDQ2NTJFYyE0Q2IjRDN2IjRTRUVTFUVEYVKBVSMVJSFE0zOXNSJIqVElMVM4MiZEFSMhRlJEJUZnMycmQmQyJDl1JzVjMXQ0MzMjE1VUI1JDJUQyYRQ2JVZzQUJDM2IyInEkY1QiZTJEMRMiMxRVNEUjJUNkJHNSQiNCVCIyIjJUQlEhNUdFUhQzgkcSZaJUVUM0YiJEM2SjczUUIUIlQiM0RiQkIzZhRBJSRzQ0ZUI00UUSRSQlQmMkNINzODQhJFRTZ0FRQ3QTRhIzFTJFRBMmMzQzQhZENUMiIlV2VEMiNFRWQ1F1IyFXRSUyRTMqZ3I0YyhUNEJRMjISZRc2NDOEIjIxVGVWIXYyMiNCJBFDQSMhIzMjVFIDElgyJCUyVFgkRSQzIjJFQlNWRTQWMmQzFFOiMzVTZGMxNFZUNmIjRjETNUNURERTQjYVIkEzNEEyNDNTVUJSVzVkMjEyUlMjQ0RGgyFFNUQhRGMmRUQ2ZSOFETUYNlZCUhRiU2QhVUUiIlJDRjMhRVJDZxNSRTNBRCEoI0FGNUVRE0VFOGdCRDM2QkJCFSQhMxITRoE0VFIzVWUiUTNkRhNDMiMmIzRDQSNTFDoldaJDcnNjkSMJg3IkIiRENSQmciUhY2NFQ4RSNoJENkWDMmVCJGMxQjJGJScyNTJDVDNEEiZSMzQyIyVGRTNEIUw=",
"AQgH+hQAFyMzlFVXNCNlRxRUYlRUUUZCMnRFJiR0WTgyZiRJZzRFQkVTVVVWc2ZFMlY1QkIxYUQTI0JDY1YkNEVENGUuQTRiNkQ0VUEzNkKUKLSIVkUhNiZURnRFMzcjVEBTdjVVVCIzJDM0hjc0RDVlVjRqMjJVZTNSM0QmQyMTRlNzVCNERFQyMxNBZHMiUSdYIUUjNlVjNzRyYWFHRHI3hKMnYnhFNCZOdlNUZBM0Q0clNTVBiEQRMUQzNSNVQ0IkEmZYNzIyNkRSUik2VBOVRCRDg0IilEMlcjRJMkJDSjRCJURTVDJBMmRTVBM1YyRRMSQoRDV2YzRDVCUkQWFFNDYnQ0IkUzRjRkQ1dGI0VUYzRERCQ1I2dFNhREOUUjJDc0NTN0JFNUZJRGFpU1Q0QyJlNiMzNCZSKFQzYnNUWTMiRGMiRWdSQzMiQnQ0QSgjVUMiE0hRM1NVUiZVIlRkRVMzI2VkRjQWQ1YyRiZWNHQXQ0UllUMSVTJDQzMkWCQiRFglMzIzKEYzJSJFMyREVIQlVFFlYzMDQyVWUZNCQlM0NUJFIkWiNnREdEJDImNWJDOIcmKyQzc5VDVRQ3PVNjQzIkJTQ3FzMjMyRFVFVTUlNUZEMzEjI0Q0M0Y2U1JTREQjIhZScUJjQkYhFRJFQyI0pTVmFTVlMkJXNDI1U3dFZkR2U0NCVRQyRih0UkIhckRUY0ZHSG00EiJUdVIxVjVGNnUVZCxEQkNTQjQ0IkZDciIkODYxM1MzRZRHQxVEZHZWJFIzRRZjVDNBMzI1Q1FEhUMiI0NkJWJWJDJzYlQiRSQjRoRiRhJTIjNSRVJEM1MiYmUiNBkjFkczRWU1SURIJUVDRFQ0QyZCUlRENEImE2FDQxRjlEdTI3RSNEU3RGJyWDNVMTVJNDM1QkJFQmNWRXUlcxNEQzNTGCtDUlNDMzMzY2VlcUQlaUIyZVMzA3NFM1NDc0JjZDUkQiFDY3QUczQzUkVDQjMiUWQ0NEQyNRVTMRJFM2RUMZNSQkQ0MkIiUgGCUkRig1UiElQkdDJFJDciVGIxMjQzI1UlNlRTM1JkRDc+RSM0VFUzMjWCU0RDMxJyJVJGI1VTEUQyM1R0I1c0NFNTM3MhIlUkNFIlZGNURkVURyNIVCMyYzQmQjITRkVHQ2NINGQ0Y0UW0icyUzMydEVBJVJIJENkUjRVIjQSNVYnEzVYMzUmYzGVNFRiQk0iVTVCM0RjJSMyRWRSQkURJBR0M0NzhnRlM3IzQxMTRDJjM1UVUkJCNUQTVGQlEzN0VDMyM0MmO2QoQzNSVURhFEAkU2IldINHRUU00zNFJVQxUkZEcVMyJSJkQjKFNCNUOzIYJEHEUyKCQjJESSY=",
"AQcH/x0BbjQ2JTpUlkdFRERHVDRkWDU0RGR1ejRURHZ6IzUqdJN1M1VFQiNHI1NTI0J1VHOGZYVFVTRIRJVkVmUolWVShERjSDRVMlRlJDU2VFh3UmR1Mjg3K0M2SUY0Q0ZUspNiJEdZMmc3YkxGSERGOGdjgzNRVGM1Q1UnN0RHU1Y0WWUzRWVEJSRSeGQ0RlNFJVVJU3YoQkdEQ2M2MiVFUyJWRUVWNmRkM0NkVER2WXNkR0QlNGNEVlYzZSS4RDMyVEQ1ckRTM0ZoMlQ2tURGQ0OFQ0ZiY1ZFNEajdXEjVSI6ZWSjNHVRRTRVMldzUjm0NGU0dlhESFRDM0IzVCYkdjdlJJRFVDaHEzUkRmNWOEVXZTM0U0VkREdSUjRHVVViVCVFVUN0RDNDkl01VHMoNVQzYlZFZmNVVUNDQ1VjUiQ2NTV0UzZVModSNEY4Zpc2JjhjiFJVUGM0SHI0UzRTU1R2R0d3VENUZSQzRUZlY4d0aGNkhTQzWVZFZTZkJ2NEZaVDU1alJWpFJpRGRnIlZUU1ZUR2M1NzOkVEMzVjZERiVlRYSEkmU4RLM0RTQ2Q2RjM3RTNhdVVEQzRXJUZTRmM1OEZTYyJkRGRjZDVTlDhSMzdXQiU1RFUiIoRpVGlXIjY1UVVjc0RDJDNSM0NVJTNkRUU1U0lDdEVXY2NGVVNJVmJJRTREVVNiMyVIQ3U6O0U0M0MzZFVVIzJmNERWJaJjikIlRXk1hFQ2NEU0RUN4UzdENEsVgzZFVidXUnU2VRZFRUQmZmRERCQ0ZER2Q3YnZFNlVpJUkzZVREKFWEUzVVMzYzQzQhfTYzQ0IlI5UoV0RGJCVXSDkyZCRSU3ITUkNoYzJUMkYzhlVVRTNyaDNmQzRDVVRjNkVUhEJyRBR2JlOEREVUU0RjY4Nkc3ZERGUyVDNFZGNFOTY3U1OKNlkjQy1TVlRTQ0M1REU2QhgzUzUzOWlWQ1Z3RTQzIzc7RXVkI0M4NCNYRVRGNEZbhFEyVJI0R1OUZEQ3VUVEQlU1NkNYJEYzdSQ0ZSNGeEWIVVU3KEVFY1RZQ0JSNEJFNFMyM0UzN0hHNTQjMlRGNkiEMyVjRFNVRXNkZGM2M4hENCMnU1VWQjNFRkO2VmO1RndEVzWTQiiHQ0NzM2clM4NjQxpjQjZEVTNEpEdlREJzc3OjZnRlNFNWJVNFeDokNCRmQ5NURJVUZSJyRDRXikVURVITZDNGW0ITNEOUQ0RUklZDQjYjVENURDRCRmRDU1hCY2VTR0RGIzJSZzlSczdTFJJkRlZyU1M1JTdVhDYhVFczQ0hTRIc0RCNDdUJEQxNlZEQ2ZEUiJJRFU3YzVGRER0R2ZlNFOTU1MyRGI0RzMkQ2Q="
);
List<HyperLogLogCollector> collectors = Lists.transform(
objects,
s -> HyperLogLogCollector.makeCollector(ByteBuffer.wrap(StringUtils.decodeBase64String(s)))
);
Collection<List<HyperLogLogCollector>> permutations = Collections2.permutations(collectors);
for (List<HyperLogLogCollector> permutation : permutations) {
HyperLogLogCollector collector = HyperLogLogCollector.makeLatestCollector();
for (HyperLogLogCollector foldee : permutation) {
collector.fold(foldee);
}
Assert.assertEquals(29, collector.getMaxOverflowValue());
Assert.assertEquals(366, collector.getMaxOverflowRegister());
Assert.assertEquals(1.0429189446653817E7, collector.estimateCardinality(), 1);
}
}
// Provides a nice printout of error rates as a function of cardinality
@Ignore
@Test
public void showErrorRate()
{
HashFunction fn = Hashing.murmur3_128();
Random random = ThreadLocalRandom.current();
double error = 0.0d;
int count = 0;
final int[] valsToCheck = {
10, 20, 50, 100, 1000, 2000, 5000, 10000, 20000, 50000, 100000, 1000000, 2000000, 10000000, Integer.MAX_VALUE
};
for (int numThings : valsToCheck) {
long startTime = System.currentTimeMillis();
HyperLogLogCollector collector = HyperLogLogCollector.makeLatestCollector();
for (int i = 0; i < numThings; ++i) {
if (i != 0 && i % 100000000 == 0) {
++count;
error = computeError(error, count, i, startTime, collector);
}
collector.add(fn.hashLong(random.nextLong()).asBytes());
}
++count;
error = computeError(error, count, numThings, startTime, collector);
}
}
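/**
* Computes the relative error of the current estimate, adds it to the running total used by
* {@link #showErrorRate()}, logs both, and returns the accumulated error.
*/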
private double computeError(double error, int count, int numThings, long startTime, HyperLogLogCollector collector)
{
final double estimatedValue = collector.estimateCardinality();
final double errorThisTime = Math.abs((double) numThings - estimatedValue) / numThings;
error += errorThisTime;
log.info(
"%,d ==? %,f in %,d millis. actual error[%,f%%], avg. error [%,f%%]",
numThings,
estimatedValue,
System.currentTimeMillis() - startTime,
100 * errorThisTime,
(error / count) * 100
);
return error;
}
}