| /* |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| */ |
| |
| package org.apache.druid.hll; |
| |
| import com.google.common.collect.Collections2; |
| import com.google.common.collect.Lists; |
| import com.google.common.hash.HashFunction; |
| import com.google.common.hash.Hasher; |
| import com.google.common.hash.Hashing; |
| import org.apache.druid.java.util.common.StringUtils; |
| import org.apache.druid.java.util.common.logger.Logger; |
| import org.junit.Assert; |
| import org.junit.Ignore; |
| import org.junit.Test; |
| |
| import java.nio.ByteBuffer; |
| import java.nio.ByteOrder; |
| import java.security.MessageDigest; |
| import java.util.Arrays; |
| import java.util.Collection; |
| import java.util.List; |
| import java.util.Random; |
| import java.util.concurrent.ThreadLocalRandom; |
| import java.util.function.Predicate; |
| |
| /** |
| * |
| */ |
| public class HyperLogLogCollectorTest |
| { |
  // Logger used to report estimates in the long-running (@Ignore'd) cardinality tests.
  private static final Logger log = new Logger(HyperLogLogCollectorTest.class);

  // 128-bit murmur3 produces the hashed inputs fed to HyperLogLogCollector.add(byte[]);
  // shared by most tests in this class.
  private final HashFunction fn = Hashing.murmur3_128();
| |
| private static void fillBuckets(HyperLogLogCollector collector, byte startOffset, byte endOffset) |
| { |
| byte offset = startOffset; |
| while (offset <= endOffset) { |
| // fill buckets to shift registerOffset |
| for (short bucket = 0; bucket < 2048; ++bucket) { |
| collector.add(bucket, offset); |
| } |
| offset++; |
| } |
| } |
| |
| @Test |
| public void testFolding() |
| { |
| final Random random = new Random(0); |
| final int[] numValsToCheck = {10, 20, 50, 100, 1000, 2000}; |
| for (int numThings : numValsToCheck) { |
| HyperLogLogCollector allCombined = HyperLogLogCollector.makeLatestCollector(); |
| HyperLogLogCollector oneHalf = HyperLogLogCollector.makeLatestCollector(); |
| HyperLogLogCollector otherHalf = HyperLogLogCollector.makeLatestCollector(); |
| |
| for (int i = 0; i < numThings; ++i) { |
| byte[] hashedVal = fn.hashLong(random.nextLong()).asBytes(); |
| |
| allCombined.add(hashedVal); |
| if (i % 2 == 0) { |
| oneHalf.add(hashedVal); |
| } else { |
| otherHalf.add(hashedVal); |
| } |
| } |
| |
| HyperLogLogCollector folded = HyperLogLogCollector.makeLatestCollector(); |
| |
| folded.fold(oneHalf); |
| Assert.assertEquals(oneHalf, folded); |
| Assert.assertEquals(oneHalf.estimateCardinality(), folded.estimateCardinality(), 0.0d); |
| |
| folded.fold(otherHalf); |
| Assert.assertEquals(allCombined, folded); |
| Assert.assertEquals(allCombined.estimateCardinality(), folded.estimateCardinality(), 0.0d); |
| } |
| } |
| |
| /** |
| * This is a very long running test, disabled by default. |
| * It is meant to catch issues when combining a large numer of HLL objects. |
| * |
| * It compares adding all the values to one HLL vs. |
| * splitting up values into HLLs of 100 values each, and folding those HLLs into a single main HLL. |
| * |
| * When reaching very large cardinalities (>> 50,000,000), offsets are mismatched between the main HLL and the ones |
| * with 100 values, requiring a floating max as described in |
| * https://druid.apache.org/blog/2014/02/18/hyperloglog-optimizations-for-real-world-systems.html |
| */ |
| @Ignore |
| @Test |
| public void testHighCardinalityRollingFold() throws Exception |
| { |
| final HyperLogLogCollector rolling = HyperLogLogCollector.makeLatestCollector(); |
| final HyperLogLogCollector simple = HyperLogLogCollector.makeLatestCollector(); |
| |
| MessageDigest md = MessageDigest.getInstance("SHA-1"); |
| HyperLogLogCollector tmp = HyperLogLogCollector.makeLatestCollector(); |
| |
| int count; |
| for (count = 0; count < 100_000_000; ++count) { |
| md.update(StringUtils.toUtf8(Integer.toString(count))); |
| |
| byte[] hashed = fn.hashBytes(md.digest()).asBytes(); |
| |
| tmp.add(hashed); |
| simple.add(hashed); |
| |
| if (count % 100 == 0) { |
| rolling.fold(tmp); |
| tmp = HyperLogLogCollector.makeLatestCollector(); |
| } |
| } |
| |
| int n = count; |
| |
| log.info("True cardinality " + n); |
| log.info("Rolling buffer cardinality " + rolling.estimateCardinality()); |
| log.info("Simple buffer cardinality " + simple.estimateCardinality()); |
| log.info("Rolling cardinality estimate off by %4.1f%%", 100 * (1 - rolling.estimateCardinality() / n)); |
| |
| Assert.assertEquals(n, simple.estimateCardinality(), n * 0.05); |
| Assert.assertEquals(n, rolling.estimateCardinality(), n * 0.05); |
| } |
| |
| @Ignore |
| @Test |
| public void testHighCardinalityRollingFold2() |
| { |
| final HyperLogLogCollector rolling = HyperLogLogCollector.makeLatestCollector(); |
| int count; |
| long start = System.currentTimeMillis(); |
| |
| for (count = 0; count < 50_000_000; ++count) { |
| HyperLogLogCollector theCollector = HyperLogLogCollector.makeLatestCollector(); |
| theCollector.add(fn.hashLong(count).asBytes()); |
| rolling.fold(theCollector); |
| } |
| log.info("testHighCardinalityRollingFold2 took %d ms", System.currentTimeMillis() - start); |
| |
| int n = count; |
| |
| log.info("True cardinality " + n); |
| log.info("Rolling buffer cardinality " + rolling.estimateCardinality()); |
| log.info("Rolling cardinality estimate off by %4.1f%%", 100 * (1 - rolling.estimateCardinality() / n)); |
| |
| Assert.assertEquals(n, rolling.estimateCardinality(), n * 0.05); |
| } |
| |
| @Test |
| public void testFoldingByteBuffers() |
| { |
| final Random random = new Random(0); |
| final int[] numValsToCheck = {10, 20, 50, 100, 1000, 2000}; |
| for (int numThings : numValsToCheck) { |
| HyperLogLogCollector allCombined = HyperLogLogCollector.makeLatestCollector(); |
| HyperLogLogCollector oneHalf = HyperLogLogCollector.makeLatestCollector(); |
| HyperLogLogCollector otherHalf = HyperLogLogCollector.makeLatestCollector(); |
| |
| for (int i = 0; i < numThings; ++i) { |
| byte[] hashedVal = fn.hashLong(random.nextLong()).asBytes(); |
| |
| allCombined.add(hashedVal); |
| if (i % 2 == 0) { |
| oneHalf.add(hashedVal); |
| } else { |
| otherHalf.add(hashedVal); |
| } |
| } |
| |
| HyperLogLogCollector folded = HyperLogLogCollector.makeLatestCollector(); |
| |
| folded.fold(oneHalf.toByteBuffer()); |
| Assert.assertEquals(oneHalf, folded); |
| Assert.assertEquals(oneHalf.estimateCardinality(), folded.estimateCardinality(), 0.0d); |
| |
| folded.fold(otherHalf.toByteBuffer()); |
| Assert.assertEquals(allCombined, folded); |
| Assert.assertEquals(allCombined.estimateCardinality(), folded.estimateCardinality(), 0.0d); |
| } |
| } |
| |
| @Test |
| public void testFoldingReadOnlyByteBuffers() |
| { |
| final Random random = new Random(0); |
| final int[] numValsToCheck = {10, 20, 50, 100, 1000, 2000}; |
| for (int numThings : numValsToCheck) { |
| HyperLogLogCollector allCombined = HyperLogLogCollector.makeLatestCollector(); |
| HyperLogLogCollector oneHalf = HyperLogLogCollector.makeLatestCollector(); |
| HyperLogLogCollector otherHalf = HyperLogLogCollector.makeLatestCollector(); |
| |
| for (int i = 0; i < numThings; ++i) { |
| byte[] hashedVal = fn.hashLong(random.nextLong()).asBytes(); |
| |
| allCombined.add(hashedVal); |
| if (i % 2 == 0) { |
| oneHalf.add(hashedVal); |
| } else { |
| otherHalf.add(hashedVal); |
| } |
| } |
| |
| HyperLogLogCollector folded = HyperLogLogCollector.makeCollector( |
| ByteBuffer.wrap(HyperLogLogCollector.makeEmptyVersionedByteArray()) |
| .asReadOnlyBuffer() |
| ); |
| |
| folded.fold(oneHalf.toByteBuffer()); |
| Assert.assertEquals(oneHalf, folded); |
| Assert.assertEquals(oneHalf.estimateCardinality(), folded.estimateCardinality(), 0.0d); |
| |
| folded.fold(otherHalf.toByteBuffer()); |
| Assert.assertEquals(allCombined, folded); |
| Assert.assertEquals(allCombined.estimateCardinality(), folded.estimateCardinality(), 0.0d); |
| } |
| } |
| |
| @Test |
| public void testFoldingReadOnlyByteBuffersWithArbitraryPosition() |
| { |
| final Random random = new Random(0); |
| final int[] numValsToCheck = {10, 20, 50, 100, 1000, 2000}; |
| for (int numThings : numValsToCheck) { |
| HyperLogLogCollector allCombined = HyperLogLogCollector.makeLatestCollector(); |
| HyperLogLogCollector oneHalf = HyperLogLogCollector.makeLatestCollector(); |
| HyperLogLogCollector otherHalf = HyperLogLogCollector.makeLatestCollector(); |
| |
| for (int i = 0; i < numThings; ++i) { |
| byte[] hashedVal = fn.hashLong(random.nextLong()).asBytes(); |
| |
| allCombined.add(hashedVal); |
| if (i % 2 == 0) { |
| oneHalf.add(hashedVal); |
| } else { |
| otherHalf.add(hashedVal); |
| } |
| } |
| |
| HyperLogLogCollector folded = HyperLogLogCollector.makeCollector( |
| shiftedBuffer( |
| ByteBuffer.wrap(HyperLogLogCollector.makeEmptyVersionedByteArray()) |
| .asReadOnlyBuffer(), |
| 17 |
| ) |
| ); |
| |
| folded.fold(oneHalf.toByteBuffer()); |
| Assert.assertEquals(oneHalf, folded); |
| Assert.assertEquals(oneHalf.estimateCardinality(), folded.estimateCardinality(), 0.0d); |
| |
| folded.fold(otherHalf.toByteBuffer()); |
| Assert.assertEquals(allCombined, folded); |
| Assert.assertEquals(allCombined.estimateCardinality(), folded.estimateCardinality(), 0.0d); |
| } |
| } |
| |
| @Test |
| public void testFoldWithDifferentOffsets1() |
| { |
| ByteBuffer biggerOffset = makeCollectorBuffer(1, (byte) 0x00, 0x11); |
| ByteBuffer smallerOffset = makeCollectorBuffer(0, (byte) 0x20, 0x00); |
| |
| HyperLogLogCollector collector = HyperLogLogCollector.makeLatestCollector(); |
| collector.fold(biggerOffset); |
| collector.fold(smallerOffset); |
| |
| ByteBuffer outBuffer = collector.toByteBuffer(); |
| |
| Assert.assertEquals(outBuffer.get(), collector.getVersion()); |
| Assert.assertEquals(outBuffer.get(), 1); |
| Assert.assertEquals(outBuffer.getShort(), 2047); |
| outBuffer.get(); |
| outBuffer.getShort(); |
| Assert.assertEquals(outBuffer.get(), 0x10); |
| while (outBuffer.hasRemaining()) { |
| Assert.assertEquals(outBuffer.get(), 0x11); |
| } |
| |
| collector = HyperLogLogCollector.makeLatestCollector(); |
| collector.fold(smallerOffset); |
| collector.fold(biggerOffset); |
| |
| outBuffer = collector.toByteBuffer(); |
| |
| Assert.assertEquals(outBuffer.get(), collector.getVersion()); |
| Assert.assertEquals(outBuffer.get(), 1); |
| Assert.assertEquals(outBuffer.getShort(), 2047); |
| Assert.assertEquals(outBuffer.get(), 0); |
| Assert.assertEquals(outBuffer.getShort(), 0); |
| Assert.assertEquals(outBuffer.get(), 0x10); |
| while (outBuffer.hasRemaining()) { |
| Assert.assertEquals(outBuffer.get(), 0x11); |
| } |
| } |
| |
| @Test |
| public void testBufferSwap() |
| { |
| ByteBuffer biggerOffset = makeCollectorBuffer(1, (byte) 0x00, 0x11); |
| ByteBuffer smallerOffset = makeCollectorBuffer(0, (byte) 0x20, 0x00); |
| |
| ByteBuffer buffer = ByteBuffer.allocate(HyperLogLogCollector.getLatestNumBytesForDenseStorage()); |
| HyperLogLogCollector collector = HyperLogLogCollector.makeCollector(buffer.duplicate()); |
| |
| // make sure the original buffer gets modified |
| collector.fold(biggerOffset); |
| Assert.assertEquals(collector, HyperLogLogCollector.makeCollector(buffer.duplicate())); |
| |
| // make sure the original buffer gets modified |
| collector.fold(smallerOffset); |
| Assert.assertEquals(collector, HyperLogLogCollector.makeCollector(buffer.duplicate())); |
| } |
| |
| @Test |
| public void testFoldWithArbitraryInitialPositions() |
| { |
| ByteBuffer biggerOffset = shiftedBuffer(makeCollectorBuffer(1, (byte) 0x00, 0x11), 10); |
| ByteBuffer smallerOffset = shiftedBuffer(makeCollectorBuffer(0, (byte) 0x20, 0x00), 15); |
| |
| HyperLogLogCollector collector = HyperLogLogCollector.makeLatestCollector(); |
| collector.fold(biggerOffset); |
| collector.fold(smallerOffset); |
| |
| ByteBuffer outBuffer = collector.toByteBuffer(); |
| |
| Assert.assertEquals(outBuffer.get(), collector.getVersion()); |
| Assert.assertEquals(outBuffer.get(), 1); |
| Assert.assertEquals(outBuffer.getShort(), 2047); |
| outBuffer.get(); |
| outBuffer.getShort(); |
| Assert.assertEquals(outBuffer.get(), 0x10); |
| while (outBuffer.hasRemaining()) { |
| Assert.assertEquals(outBuffer.get(), 0x11); |
| } |
| |
| collector = HyperLogLogCollector.makeLatestCollector(); |
| collector.fold(smallerOffset); |
| collector.fold(biggerOffset); |
| |
| outBuffer = collector.toByteBuffer(); |
| |
| Assert.assertEquals(outBuffer.get(), collector.getVersion()); |
| Assert.assertEquals(outBuffer.get(), 1); |
| Assert.assertEquals(outBuffer.getShort(), 2047); |
| outBuffer.get(); |
| outBuffer.getShort(); |
| Assert.assertEquals(outBuffer.get(), 0x10); |
| while (outBuffer.hasRemaining()) { |
| Assert.assertEquals(outBuffer.get(), 0x11); |
| } |
| } |
| |
| protected ByteBuffer shiftedBuffer(ByteBuffer buf, int offset) |
| { |
| ByteBuffer shifted = ByteBuffer.allocate(buf.remaining() + offset); |
| shifted.position(offset); |
| shifted.put(buf); |
| shifted.position(offset); |
| return shifted; |
| } |
| |
| @Test |
| public void testFoldWithDifferentOffsets2() |
| { |
| ByteBuffer biggerOffset = makeCollectorBuffer(1, (byte) 0x01, 0x11); |
| ByteBuffer smallerOffset = makeCollectorBuffer(0, (byte) 0x20, 0x00); |
| |
| HyperLogLogCollector collector = HyperLogLogCollector.makeLatestCollector(); |
| collector.fold(biggerOffset); |
| collector.fold(smallerOffset); |
| |
| ByteBuffer outBuffer = collector.toByteBuffer(); |
| |
| Assert.assertEquals(outBuffer.get(), collector.getVersion()); |
| Assert.assertEquals(outBuffer.get(), 2); |
| Assert.assertEquals(outBuffer.getShort(), 0); |
| outBuffer.get(); |
| outBuffer.getShort(); |
| Assert.assertFalse(outBuffer.hasRemaining()); |
| |
| collector = HyperLogLogCollector.makeLatestCollector(); |
| collector.fold(smallerOffset); |
| collector.fold(biggerOffset); |
| |
| outBuffer = collector.toByteBuffer(); |
| |
| Assert.assertEquals(outBuffer.get(), collector.getVersion()); |
| Assert.assertEquals(outBuffer.get(), 2); |
| Assert.assertEquals(outBuffer.getShort(), 0); |
| outBuffer.get(); |
| outBuffer.getShort(); |
| Assert.assertFalse(outBuffer.hasRemaining()); |
| } |
| |
| @Test |
| public void testFoldWithUpperNibbleTriggersOffsetChange() |
| { |
| byte[] arr1 = new byte[HyperLogLogCollector.getLatestNumBytesForDenseStorage()]; |
| Arrays.fill(arr1, (byte) 0x11); |
| ByteBuffer buffer1 = ByteBuffer.wrap(arr1); |
| buffer1.put(0, VersionOneHyperLogLogCollector.VERSION); |
| buffer1.put(1, (byte) 0); |
| buffer1.putShort(2, (short) (2047)); |
| buffer1.put(VersionOneHyperLogLogCollector.HEADER_NUM_BYTES, (byte) 0x1); |
| |
| byte[] arr2 = new byte[HyperLogLogCollector.getLatestNumBytesForDenseStorage()]; |
| Arrays.fill(arr2, (byte) 0x11); |
| ByteBuffer buffer2 = ByteBuffer.wrap(arr2); |
| buffer2.put(0, VersionOneHyperLogLogCollector.VERSION); |
| buffer2.put(1, (byte) 0); |
| buffer2.putShort(2, (short) (2048)); |
| |
| HyperLogLogCollector collector = HyperLogLogCollector.makeCollector(buffer1); |
| collector.fold(buffer2); |
| |
| ByteBuffer outBuffer = collector.toByteBuffer(); |
| |
| Assert.assertEquals(outBuffer.get(), VersionOneHyperLogLogCollector.VERSION); |
| Assert.assertEquals(outBuffer.get(), 1); |
| Assert.assertEquals(outBuffer.getShort(), 0); |
| outBuffer.get(); |
| outBuffer.getShort(); |
| Assert.assertFalse(outBuffer.hasRemaining()); |
| } |
| |
| @Test |
| public void testSparseFoldWithDifferentOffsets1() |
| { |
| ByteBuffer biggerOffset = makeCollectorBuffer(1, new byte[]{0x11, 0x10}, 0x11); |
| ByteBuffer sparse = HyperLogLogCollector.makeCollector(makeCollectorBuffer(0, new byte[]{0x00, 0x02}, 0x00)) |
| .toByteBuffer(); |
| |
| HyperLogLogCollector collector = HyperLogLogCollector.makeLatestCollector(); |
| collector.fold(biggerOffset); |
| collector.fold(sparse); |
| |
| ByteBuffer outBuffer = collector.toByteBuffer(); |
| |
| Assert.assertEquals(outBuffer.get(), collector.getVersion()); |
| Assert.assertEquals(outBuffer.get(), 2); |
| Assert.assertEquals(outBuffer.getShort(), 0); |
| Assert.assertEquals(outBuffer.get(), 0); |
| Assert.assertEquals(outBuffer.getShort(), 0); |
| Assert.assertFalse(outBuffer.hasRemaining()); |
| |
| collector = HyperLogLogCollector.makeLatestCollector(); |
| collector.fold(sparse); |
| collector.fold(biggerOffset); |
| |
| outBuffer = collector.toByteBuffer(); |
| |
| Assert.assertEquals(outBuffer.get(), collector.getVersion()); |
| Assert.assertEquals(outBuffer.get(), 2); |
| Assert.assertEquals(outBuffer.getShort(), 0); |
| Assert.assertEquals(outBuffer.get(), 0); |
| Assert.assertEquals(outBuffer.getShort(), 0); |
| Assert.assertFalse(outBuffer.hasRemaining()); |
| } |
| |
  // Convenience overload: build a collector buffer whose register payload starts
  // with a single byte.
  private ByteBuffer makeCollectorBuffer(int offset, byte initialBytes, int remainingBytes)
  {
    return makeCollectorBuffer(offset, new byte[]{initialBytes}, remainingBytes);
  }
| |
| private ByteBuffer makeCollectorBuffer(int offset, byte[] initialBytes, int remainingBytes) |
| { |
| short numNonZero = 0; |
| for (byte initialByte : initialBytes) { |
| numNonZero += computeNumNonZero(initialByte); |
| } |
| |
| final short numNonZeroInRemaining = computeNumNonZero((byte) remainingBytes); |
| numNonZero += (short) ((HyperLogLogCollector.NUM_BYTES_FOR_BUCKETS - initialBytes.length) * numNonZeroInRemaining); |
| |
| ByteBuffer biggerOffset = ByteBuffer.allocate(HyperLogLogCollector.getLatestNumBytesForDenseStorage()); |
| biggerOffset.put(VersionOneHyperLogLogCollector.VERSION); |
| biggerOffset.put((byte) offset); |
| biggerOffset.putShort(numNonZero); |
| biggerOffset.put((byte) 0); |
| biggerOffset.putShort((short) 0); |
| biggerOffset.put(initialBytes); |
| while (biggerOffset.hasRemaining()) { |
| biggerOffset.put((byte) remainingBytes); |
| } |
| biggerOffset.clear(); |
| return biggerOffset.asReadOnlyBuffer(); |
| } |
| |
| private short computeNumNonZero(byte theByte) |
| { |
| short retVal = 0; |
| if ((theByte & 0x0f) > 0) { |
| ++retVal; |
| } |
| if ((theByte & 0xf0) > 0) { |
| ++retVal; |
| } |
| return retVal; |
| } |
| |
| @Ignore |
| @Test // This test can help when finding potential combinations that are weird, but it's non-deterministic |
| public void testFoldingwithDifferentOffsets() |
| { |
| // final Random random = new Random(37); // this seed will cause this test to fail because of slightly larger errors |
| final Random random = new Random(0); |
| for (int j = 0; j < 10; j++) { |
| HyperLogLogCollector smallVals = HyperLogLogCollector.makeLatestCollector(); |
| HyperLogLogCollector bigVals = HyperLogLogCollector.makeLatestCollector(); |
| HyperLogLogCollector all = HyperLogLogCollector.makeLatestCollector(); |
| |
| int numThings = 500000; |
| for (int i = 0; i < numThings; i++) { |
| byte[] hashedVal = fn.hashLong(random.nextLong()).asBytes(); |
| |
| if (i < 1000) { |
| smallVals.add(hashedVal); |
| } else { |
| bigVals.add(hashedVal); |
| } |
| all.add(hashedVal); |
| } |
| |
| HyperLogLogCollector folded = HyperLogLogCollector.makeLatestCollector(); |
| folded.fold(smallVals); |
| folded.fold(bigVals); |
| final double expected = all.estimateCardinality(); |
| Assert.assertEquals(expected, folded.estimateCardinality(), expected * 0.025); |
| Assert.assertEquals(numThings, folded.estimateCardinality(), numThings * 0.05); |
| } |
| } |
| |
| @Ignore |
| @Test |
| public void testFoldingwithDifferentOffsets2() throws Exception |
| { |
| final Random random = new Random(0); |
| MessageDigest md = MessageDigest.getInstance("SHA-1"); |
| |
| for (int j = 0; j < 1; j++) { |
| HyperLogLogCollector evenVals = HyperLogLogCollector.makeLatestCollector(); |
| HyperLogLogCollector oddVals = HyperLogLogCollector.makeLatestCollector(); |
| HyperLogLogCollector all = HyperLogLogCollector.makeLatestCollector(); |
| |
| int numThings = 500000; |
| for (int i = 0; i < numThings; i++) { |
| md.update(StringUtils.toUtf8(Integer.toString(random.nextInt()))); |
| byte[] hashedVal = fn.hashBytes(md.digest()).asBytes(); |
| |
| if (i % 2 == 0) { |
| evenVals.add(hashedVal); |
| } else { |
| oddVals.add(hashedVal); |
| } |
| all.add(hashedVal); |
| } |
| |
| HyperLogLogCollector folded = HyperLogLogCollector.makeLatestCollector(); |
| folded.fold(evenVals); |
| folded.fold(oddVals); |
| final double expected = all.estimateCardinality(); |
| Assert.assertEquals(expected, folded.estimateCardinality(), expected * 0.025); |
| Assert.assertEquals(numThings, folded.estimateCardinality(), numThings * 0.05); |
| } |
| } |
| |
| @Test |
| public void testEstimation() |
| { |
| Random random = new Random(0L); |
| |
| final int[] valsToCheck = {10, 20, 50, 100, 1000, 2000, 5000, 10000, 20000, 50000, 100000, 1000000, 2000000}; |
| final double[] expectedVals = { |
| 11.029647221949576, 21.108407720752034, 51.64575281885815, 100.42231726408892, |
| 981.8579991802412, 1943.1337257462792, 4946.192042635218, 9935.088157579434, |
| 20366.1486889433, 49433.56029693898, 100615.26273314281, 980831.624899156000, |
| 1982408.2608981386 |
| }; |
| |
| int valsToCheckIndex = 0; |
| HyperLogLogCollector collector = HyperLogLogCollector.makeLatestCollector(); |
| for (int i = 0; i < valsToCheck[valsToCheck.length - 1]; ++i) { |
| collector.add(fn.hashLong(random.nextLong()).asBytes()); |
| if (i == valsToCheck[valsToCheckIndex]) { |
| Assert.assertEquals(expectedVals[valsToCheckIndex], collector.estimateCardinality(), 0.0d); |
| ++valsToCheckIndex; |
| } |
| } |
| Assert.assertEquals(expectedVals.length, valsToCheckIndex + 1); |
| Assert.assertEquals(expectedVals[valsToCheckIndex], collector.estimateCardinality(), 0.0d); |
| } |
| |
| @Test |
| public void testEstimationReadOnlyByteBuffers() |
| { |
| Random random = new Random(0L); |
| |
| final int[] valsToCheck = {10, 20, 50, 100, 1000, 2000, 5000, 10000, 20000, 50000, 100000, 1000000, 2000000}; |
| final double[] expectedVals = { |
| 11.029647221949576, 21.108407720752034, 51.64575281885815, 100.42231726408892, |
| 981.8579991802412, 1943.1337257462792, 4946.192042635218, 9935.088157579434, |
| 20366.1486889433, 49433.56029693898, 100615.26273314281, 980831.624899156000, |
| 1982408.2608981386 |
| }; |
| |
| int valsToCheckIndex = 0; |
| HyperLogLogCollector collector = HyperLogLogCollector.makeCollector( |
| ByteBuffer.allocateDirect( |
| HyperLogLogCollector.getLatestNumBytesForDenseStorage() |
| ) |
| ); |
| for (int i = 0; i < valsToCheck[valsToCheck.length - 1]; ++i) { |
| collector.add(fn.hashLong(random.nextLong()).asBytes()); |
| if (i == valsToCheck[valsToCheckIndex]) { |
| Assert.assertEquals(expectedVals[valsToCheckIndex], collector.estimateCardinality(), 0.0d); |
| ++valsToCheckIndex; |
| } |
| } |
| Assert.assertEquals(expectedVals.length, valsToCheckIndex + 1); |
| Assert.assertEquals(expectedVals[valsToCheckIndex], collector.estimateCardinality(), 0.0d); |
| } |
| |
| @Test |
| public void testEstimationLimitDifferentFromCapacity() |
| { |
| Random random = new Random(0L); |
| |
| final int[] valsToCheck = {10, 20, 50, 100, 1000, 2000, 5000, 10000, 20000, 50000, 100000, 1000000, 2000000}; |
| final double[] expectedVals = { |
| 11.029647221949576, 21.108407720752034, 51.64575281885815, 100.42231726408892, |
| 981.8579991802412, 1943.1337257462792, 4946.192042635218, 9935.088157579434, |
| 20366.1486889433, 49433.56029693898, 100615.26273314281, 980831.624899156000, |
| 1982408.2608981386 |
| }; |
| |
| int valsToCheckIndex = 0; |
| HyperLogLogCollector collector = HyperLogLogCollector.makeCollector( |
| (ByteBuffer) ByteBuffer.allocate(10000) |
| .position(0) |
| .limit(HyperLogLogCollector.getLatestNumBytesForDenseStorage()) |
| ); |
| for (int i = 0; i < valsToCheck[valsToCheck.length - 1]; ++i) { |
| collector.add(fn.hashLong(random.nextLong()).asBytes()); |
| if (i == valsToCheck[valsToCheckIndex]) { |
| Assert.assertEquals(expectedVals[valsToCheckIndex], collector.estimateCardinality(), 0.0d); |
| ++valsToCheckIndex; |
| } |
| } |
| Assert.assertEquals(expectedVals.length, valsToCheckIndex + 1); |
| Assert.assertEquals(expectedVals[valsToCheckIndex], collector.estimateCardinality(), 0.0d); |
| } |
| |
| @Test |
| public void testSparseEstimation() |
| { |
| final Random random = new Random(0); |
| HyperLogLogCollector collector = HyperLogLogCollector.makeLatestCollector(); |
| |
| for (int i = 0; i < 100; ++i) { |
| collector.add(fn.hashLong(random.nextLong()).asBytes()); |
| } |
| |
| Assert.assertEquals( |
| collector.estimateCardinality(), HyperLogLogCollector.estimateByteBuffer(collector.toByteBuffer()), 0.0d |
| ); |
| } |
| |
| @Test |
| public void testHighBits() |
| { |
| HyperLogLogCollector collector = HyperLogLogCollector.makeLatestCollector(); |
| |
| // fill up all the buckets so we reach a registerOffset of 49 |
| fillBuckets(collector, (byte) 0, (byte) 49); |
| |
| // highest possible bit position is 64 |
| collector.add(new byte[]{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}); |
| Assert.assertEquals(8.5089685793441677E17, collector.estimateCardinality(), 1000); |
| |
| // this might happen once in a million years if you hash a billion values a second |
| fillBuckets(collector, (byte) 0, (byte) 63); |
| collector.add(new byte[]{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}); |
| |
| Assert.assertEquals(Double.POSITIVE_INFINITY, collector.estimateCardinality(), 1000); |
| } |
| |
  @Test
  public void testMaxOverflow()
  {
    // An added (bucket, value) pair lands in the dedicated max-overflow
    // register/value pair while the normal registers stay untouched.
    HyperLogLogCollector collector = HyperLogLogCollector.makeLatestCollector();
    collector.add((short) 23, (byte) 16);
    Assert.assertEquals(23, collector.getMaxOverflowRegister());
    Assert.assertEquals(16, collector.getMaxOverflowValue());
    Assert.assertEquals(0, collector.getRegisterOffset());
    Assert.assertEquals(0, collector.getNumNonZeroRegisters());

    // A larger overflow value replaces the previously tracked pair.
    collector.add((short) 56, (byte) 17);
    Assert.assertEquals(56, collector.getMaxOverflowRegister());
    Assert.assertEquals(17, collector.getMaxOverflowValue());

    // A smaller overflow value is ignored; registers remain untouched as well.
    collector.add((short) 43, (byte) 16);
    Assert.assertEquals(56, collector.getMaxOverflowRegister());
    Assert.assertEquals(17, collector.getMaxOverflowValue());
    Assert.assertEquals(0, collector.getRegisterOffset());
    Assert.assertEquals(0, collector.getNumNonZeroRegisters());
  }
| |
  @Test
  public void testRegisterSwapWithSparse()
  {
    final HyperLogLogCollector collector = HyperLogLogCollector.makeLatestCollector();
    // Skip the first bucket
    for (int i = 1; i < HyperLogLogCollector.NUM_BUCKETS; i++) {
      collector.add((short) i, (byte) 1);
      Assert.assertEquals(i, collector.getNumNonZeroRegisters());
      Assert.assertEquals(0, collector.getRegisterOffset());
    }
    // Estimate from a read-only serialized copy while all-but-one register is set.
    Assert.assertEquals(
        15615.219683654448D,
        HyperLogLogCollector.makeCollector(collector.toByteBuffer().asReadOnlyBuffer())
                            .estimateCardinality(),
        1e-5D
    );

    final byte[] hash = new byte[10];
    hash[0] = 1; // Bucket 0, 1 offset of 0
    collector.add(hash);
    // Filling the last empty bucket bumps registerOffset to 1 and resets the
    // non-zero register count to 0.
    Assert.assertEquals(0, collector.getNumNonZeroRegisters());
    Assert.assertEquals(1, collector.getRegisterOffset());

    // We have a REALLY bad distribution, Sketch as 0 is fine.
    Assert.assertEquals(
        0.0D,
        HyperLogLogCollector.makeCollector(collector.toByteBuffer().asReadOnlyBuffer())
                            .estimateCardinality(),
        1e-5D
    );
    // With zero non-zero registers the serialization is header-only.
    final ByteBuffer buffer = collector.toByteBuffer();
    Assert.assertEquals(collector.getNumHeaderBytes(), buffer.remaining());

    final HyperLogLogCollector denseCollector = HyperLogLogCollector.makeLatestCollector();
    for (int i = 0; i < HyperLogLogCollector.NUM_BUCKETS - 1; i++) {
      denseCollector.add((short) i, (byte) 1);
    }

    Assert.assertEquals(HyperLogLogCollector.NUM_BUCKETS - 1, denseCollector.getNumNonZeroRegisters());
    // Folding the header-only buffer into a dense collector must still produce a
    // valid, fully dense serialization.
    final HyperLogLogCollector folded = denseCollector.fold(HyperLogLogCollector.makeCollector(buffer));
    Assert.assertNotNull(folded.toByteBuffer());
    Assert.assertEquals(folded.getStorageBuffer().remaining(), denseCollector.getNumBytesForDenseStorage());
  }
| |
  // Example of a terrible sampling filter. Don't use this method
  @Test
  public void testCanFillUpOnMod()
  {
    // NOTE(review): this local `fn` shadows the instance field of the same name.
    final HashFunction fn = Hashing.murmur3_128();
    final HyperLogLogCollector hyperLogLogCollector = HyperLogLogCollector.makeLatestCollector();
    final byte[] b = new byte[10];
    b[0] = 1;
    hyperLogLogCollector.add(b);
    final Random random = new Random(347893248701078L);
    long loops = 0;
    // Do a 1% "sample" where the mod of the hash is 43
    final Predicate<Integer> pass = i -> {
      // ByteOrder.nativeOrder() on lots of systems is ByteOrder.LITTLE_ENDIAN
      final ByteBuffer bb = ByteBuffer.wrap(fn.hashInt(i).asBytes()).order(ByteOrder.LITTLE_ENDIAN);
      return (bb.getInt() % 100) == 43;
    };
    final long loopLimit = 1_000_000_000L;
    do {
      final int rnd = random.nextInt();
      if (!pass.test(rnd)) {
        continue;
      }
      final Hasher hasher = fn.newHasher();
      hasher.putInt(rnd);
      hyperLogLogCollector.add(hasher.hash().asBytes());
    } while (hyperLogLogCollector.getNumNonZeroRegisters() > 0 && ++loops < loopLimit);
    // The loop exits when the non-zero register count drops to 0; hitting
    // loopLimit instead would mean that never happened.
    Assert.assertNotEquals(loopLimit, loops);
    // At that point the serialization should consist of the header only.
    Assert.assertEquals(hyperLogLogCollector.getNumHeaderBytes(), hyperLogLogCollector.toByteBuffer().remaining());
  }
| |
| @Test |
| public void testMergeMaxOverflow() |
| { |
| // no offset |
| HyperLogLogCollector collector = HyperLogLogCollector.makeLatestCollector(); |
| collector.add((short) 23, (byte) 16); |
| |
| HyperLogLogCollector other = HyperLogLogCollector.makeLatestCollector(); |
| collector.add((short) 56, (byte) 17); |
| |
| collector.fold(other); |
| Assert.assertEquals(56, collector.getMaxOverflowRegister()); |
| Assert.assertEquals(17, collector.getMaxOverflowValue()); |
| |
| // different offsets |
| // fill up all the buckets so we reach a registerOffset of 49 |
| collector = HyperLogLogCollector.makeLatestCollector(); |
| fillBuckets(collector, (byte) 0, (byte) 49); |
| collector.add((short) 23, (byte) 65); |
| |
| other = HyperLogLogCollector.makeLatestCollector(); |
| fillBuckets(other, (byte) 0, (byte) 43); |
| other.add((short) 47, (byte) 67); |
| |
| collector.fold(other); |
| Assert.assertEquals(47, collector.getMaxOverflowRegister()); |
| Assert.assertEquals(67, collector.getMaxOverflowValue()); |
| } |
| |
| @Test |
| public void testFoldOrder() |
| { |
| final List<String> objects = Lists.newArrayList( |
| "AQcH/xYEMXOjRTVSQ1NXVENEM1RTUlVTRDI1aEVnhkOjNUaCI2MkU2VVhVNkNyVTa4NEYkS0kjZYU1RDdEYzUjglNTUzVFM0NkU3ZFUjOVJCdlU0N2QjRDRUV1MyZjNmVDOUM2RVVFRzhnUzVXY1R1RHUnNziURUdmREM0VjVEQmU0aEInZYNzNZNVRFgzVFNolSJHNIQ3QklEZlNSNoNTJXpDk1dFWjJGNYNiQzQkZFNEYzc1NVhSczM2NmJDZlc3JJRCpVNiRlNEI3dmU1ZGI0Q1RCMhNFZEJDZDYyNFOCM3U0VmRlVlNIRVQ4VVw1djNDVURHVSaFU0VEY0U1JFNIVCYlVEJWM2NWU0eURDOjQ6YyNTYkZjNUVjR1ZDdnVkMzVHZFpjMzlmNEFHM0dHJlRYTHSEQjVZVVZkVVIzIjg2SUU0NSM0VFNDNCdGVlQkhBNENCVTZGZEVlxFQyQ0NYWkUmVUJUYzRlNqg4NVVTNThEJkRGNDNUNFSEYmgkR0dDR1JldCNhVEZGRENGc1NDRUNER3WJRTRHQ4JlOYZoJDVVVVMzZSREZ1Q1UjSHNkdUMlU0ODIzZThSNmNDNjQ1o2I0YiRGYyZkNUJYVEMyN2QpQyMkc2VTE4U2VCNHZFRDNTh0IzI2VFNTMlUkNGMlKTRCIyR3QiQzFUNkRTdDM6RDRFI3VyVlcyWCUlQ0YjNjU2Q2dEVFNTRyRlI7VElHVTVVNGk0JHJTQzQkQyVlV0NCVlRkhWYkQ0RVaDNYdFZHWEWFJEYpM0QjNjNVUzNCVzVkgzZGFzQkRZUzN2U1dUFGVWZTUzVUREZDciZEVVYVNjeCU0ZDdEhzIpU2RTOFRUQkWlk1OFRUVTN1MkZSM3ZFc1VDNnUmc2NKNUaUIzd3M0RWxEZTsiNENLVHU0NFUmQ2RWRFdCNUVENFkxZCEnRLQkNEU0RVNmVDQjl9ZmNkM1QVM0MzQkUjJlVHRkNEVWlENDVUIlUvRkM0RVY1UzY6OGVHVCRDIzRUUlUjM2RDWSVkVIU1U1ZiVFNlNDhTN1VWNTVEZ2RzNzVDQlY0ZUNENUM5NUdkRDJGYzRCUzIjRGR4UmJFI4GDRTUiQ0ZUhVY1ZEYoZSRoVDYnREYkQ1SUU0RWUycjp2RZIySVZkUmZDREZVJGQyVEc1JElBZENEU2VEQlVUUnNDQziLRTNidmNjVCtjRFU2Q0SGYzVHVpGTNoVDxFVSMlWTJFQyRJdV1EI3RDloYyNFQ0c1NVY0ZHVEY0dkM2QkQyVDVUVTNFUyamMUdSNrNz0mlFlERzZTSGhFRjVGM3NWU2NINDI2U1RERUhjY4FHNWNTVTV1U0U2I0VXNEZERWNDNUSjI1WmMmQ4U=", |
| "AQgH+BUFUEUrZVRjM2IjMzJRESMlUnlTJjEjRhRlNBEyMSUpaGJTMjRCIzMTNCRENRdxNiNEZCQzNERYMiAyIiQmUTI+MhEzV1RWJoMjQjIySDN0QiYDUjUzNjRUVEYyQleDEiUmg0ERRjIjIzJUQjMxNlJGUTNDJFNTRzJiE1M0RjQzUzIiFDUmMjIzJWVCNENTIRJVODUzEkIVMhFEIjM0MkMyIRRCNFNxQyNCQ2UzOFQiJSM0EzU1V1M2EjhUVENDclZzImEiMTJBQlQiJCgyIyKkJSUlNBNDE2M3QSIyMicjMlJEUhJDJFQjJ0VSQ0QyYSFhZSNlQ4REUzVFIlOFRHIkYUJEM8RVMkMiMEczQwMlE1EkAlNiQlhCNkISRVI0ITUjRDU1JVNlK1QyGGRHQVM0NUVHQ1MkMyQoIzMzFCFUI0IhU1OIhCIlZUQVIUMyYzMlMUZ0RCKEIigUIlQ0QkQTM0MkM0QyJkUSM2I2tHJDUTQ0RBQ0YyNlUxUzIiIiMUiSMzUlJDNDQjM0ITQyNIM1MyNWM0MDOTZYVDRWIiZhMzc0NCJ0Q0NDZEMUElMyRyMmUhNiMkIZNjMkEyRTIzYkMzNUODUTNDJVM0ZTQjFCJCNWSTUlEiNCM1U2FCZUJzMVMyLjNkMhITVDEjIYMzNiVmIlO1VTMjMiVDQ2NTJFYyE0Q2IjRDN2IjRTRUVTFUVEYVKBVSMVJSFE0zOXNSJIqVElMVM4MiZEFSMhRlJEJUZnMycmQmQyJDl1JzVjMXQ0MzMjE1VUI1JDJUQyYRQ2JVZzQUJDM2IyInEkY1QiZTJEMRMiMxRVNEUjJUNkJHNSQiNCVCIyIjJUQlEhNUdFUhQzgkcSZaJUVUM0YiJEM2SjczUUIUIlQiM0RiQkIzZhRBJSRzQ0ZUI00UUSRSQlQmMkNINzODQhJFRTZ0FRQ3QTRhIzFTJFRBMmMzQzQhZENUMiIlV2VEMiNFRWQ1F1IyFXRSUyRTMqZ3I0YyhUNEJRMjISZRc2NDOEIjIxVGVWIXYyMiNCJBFDQSMhIzMjVFIDElgyJCUyVFgkRSQzIjJFQlNWRTQWMmQzFFOiMzVTZGMxNFZUNmIjRjETNUNURERTQjYVIkEzNEEyNDNTVUJSVzVkMjEyUlMjQ0RGgyFFNUQhRGMmRUQ2ZSOFETUYNlZCUhRiU2QhVUUiIlJDRjMhRVJDZxNSRTNBRCEoI0FGNUVRE0VFOGdCRDM2QkJCFSQhMxITRoE0VFIzVWUiUTNkRhNDMiMmIzRDQSNTFDoldaJDcnNjkSMJg3IkIiRENSQmciUhY2NFQ4RSNoJENkWDMmVCJGMxQjJGJScyNTJDVDNEEiZSMzQyIyVGRTNEIUw=", |
| "AQgH+hQAFyMzlFVXNCNlRxRUYlRUUUZCMnRFJiR0WTgyZiRJZzRFQkVTVVVWc2ZFMlY1QkIxYUQTI0JDY1YkNEVENGUuQTRiNkQ0VUEzNkKUKLSIVkUhNiZURnRFMzcjVEBTdjVVVCIzJDM0hjc0RDVlVjRqMjJVZTNSM0QmQyMTRlNzVCNERFQyMxNBZHMiUSdYIUUjNlVjNzRyYWFHRHI3hKMnYnhFNCZOdlNUZBM0Q0clNTVBiEQRMUQzNSNVQ0IkEmZYNzIyNkRSUik2VBOVRCRDg0IilEMlcjRJMkJDSjRCJURTVDJBMmRTVBM1YyRRMSQoRDV2YzRDVCUkQWFFNDYnQ0IkUzRjRkQ1dGI0VUYzRERCQ1I2dFNhREOUUjJDc0NTN0JFNUZJRGFpU1Q0QyJlNiMzNCZSKFQzYnNUWTMiRGMiRWdSQzMiQnQ0QSgjVUMiE0hRM1NVUiZVIlRkRVMzI2VkRjQWQ1YyRiZWNHQXQ0UllUMSVTJDQzMkWCQiRFglMzIzKEYzJSJFMyREVIQlVFFlYzMDQyVWUZNCQlM0NUJFIkWiNnREdEJDImNWJDOIcmKyQzc5VDVRQ3PVNjQzIkJTQ3FzMjMyRFVFVTUlNUZEMzEjI0Q0M0Y2U1JTREQjIhZScUJjQkYhFRJFQyI0pTVmFTVlMkJXNDI1U3dFZkR2U0NCVRQyRih0UkIhckRUY0ZHSG00EiJUdVIxVjVGNnUVZCxEQkNTQjQ0IkZDciIkODYxM1MzRZRHQxVEZHZWJFIzRRZjVDNBMzI1Q1FEhUMiI0NkJWJWJDJzYlQiRSQjRoRiRhJTIjNSRVJEM1MiYmUiNBkjFkczRWU1SURIJUVDRFQ0QyZCUlRENEImE2FDQxRjlEdTI3RSNEU3RGJyWDNVMTVJNDM1QkJFQmNWRXUlcxNEQzNTGCtDUlNDMzMzY2VlcUQlaUIyZVMzA3NFM1NDc0JjZDUkQiFDY3QUczQzUkVDQjMiUWQ0NEQyNRVTMRJFM2RUMZNSQkQ0MkIiUgGCUkRig1UiElQkdDJFJDciVGIxMjQzI1UlNlRTM1JkRDc+RSM0VFUzMjWCU0RDMxJyJVJGI1VTEUQyM1R0I1c0NFNTM3MhIlUkNFIlZGNURkVURyNIVCMyYzQmQjITRkVHQ2NINGQ0Y0UW0icyUzMydEVBJVJIJENkUjRVIjQSNVYnEzVYMzUmYzGVNFRiQk0iVTVCM0RjJSMyRWRSQkURJBR0M0NzhnRlM3IzQxMTRDJjM1UVUkJCNUQTVGQlEzN0VDMyM0MmO2QoQzNSVURhFEAkU2IldINHRUU00zNFJVQxUkZEcVMyJSJkQjKFNCNUOzIYJEHEUyKCQjJESSY=", |
| "AQcH/x0BbjQ2JTpUlkdFRERHVDRkWDU0RGR1ejRURHZ6IzUqdJN1M1VFQiNHI1NTI0J1VHOGZYVFVTRIRJVkVmUolWVShERjSDRVMlRlJDU2VFh3UmR1Mjg3K0M2SUY0Q0ZUspNiJEdZMmc3YkxGSERGOGdjgzNRVGM1Q1UnN0RHU1Y0WWUzRWVEJSRSeGQ0RlNFJVVJU3YoQkdEQ2M2MiVFUyJWRUVWNmRkM0NkVER2WXNkR0QlNGNEVlYzZSS4RDMyVEQ1ckRTM0ZoMlQ2tURGQ0OFQ0ZiY1ZFNEajdXEjVSI6ZWSjNHVRRTRVMldzUjm0NGU0dlhESFRDM0IzVCYkdjdlJJRFVDaHEzUkRmNWOEVXZTM0U0VkREdSUjRHVVViVCVFVUN0RDNDkl01VHMoNVQzYlZFZmNVVUNDQ1VjUiQ2NTV0UzZVModSNEY4Zpc2JjhjiFJVUGM0SHI0UzRTU1R2R0d3VENUZSQzRUZlY4d0aGNkhTQzWVZFZTZkJ2NEZaVDU1alJWpFJpRGRnIlZUU1ZUR2M1NzOkVEMzVjZERiVlRYSEkmU4RLM0RTQ2Q2RjM3RTNhdVVEQzRXJUZTRmM1OEZTYyJkRGRjZDVTlDhSMzdXQiU1RFUiIoRpVGlXIjY1UVVjc0RDJDNSM0NVJTNkRUU1U0lDdEVXY2NGVVNJVmJJRTREVVNiMyVIQ3U6O0U0M0MzZFVVIzJmNERWJaJjikIlRXk1hFQ2NEU0RUN4UzdENEsVgzZFVidXUnU2VRZFRUQmZmRERCQ0ZER2Q3YnZFNlVpJUkzZVREKFWEUzVVMzYzQzQhfTYzQ0IlI5UoV0RGJCVXSDkyZCRSU3ITUkNoYzJUMkYzhlVVRTNyaDNmQzRDVVRjNkVUhEJyRBR2JlOEREVUU0RjY4Nkc3ZERGUyVDNFZGNFOTY3U1OKNlkjQy1TVlRTQ0M1REU2QhgzUzUzOWlWQ1Z3RTQzIzc7RXVkI0M4NCNYRVRGNEZbhFEyVJI0R1OUZEQ3VUVEQlU1NkNYJEYzdSQ0ZSNGeEWIVVU3KEVFY1RZQ0JSNEJFNFMyM0UzN0hHNTQjMlRGNkiEMyVjRFNVRXNkZGM2M4hENCMnU1VWQjNFRkO2VmO1RndEVzWTQiiHQ0NzM2clM4NjQxpjQjZEVTNEpEdlREJzc3OjZnRlNFNWJVNFeDokNCRmQ5NURJVUZSJyRDRXikVURVITZDNGW0ITNEOUQ0RUklZDQjYjVENURDRCRmRDU1hCY2VTR0RGIzJSZzlSczdTFJJkRlZyU1M1JTdVhDYhVFczQ0hTRIc0RCNDdUJEQxNlZEQ2ZEUiJJRFU3YzVGRER0R2ZlNFOTU1MyRGI0RzMkQ2Q=" |
| ); |
| |
| List<HyperLogLogCollector> collectors = Lists.transform( |
| objects, |
| s -> HyperLogLogCollector.makeCollector(ByteBuffer.wrap(StringUtils.decodeBase64String(s))) |
| ); |
| |
| Collection<List<HyperLogLogCollector>> permutations = Collections2.permutations(collectors); |
| |
| for (List<HyperLogLogCollector> permutation : permutations) { |
| HyperLogLogCollector collector = HyperLogLogCollector.makeLatestCollector(); |
| |
| for (HyperLogLogCollector foldee : permutation) { |
| collector.fold(foldee); |
| } |
| Assert.assertEquals(29, collector.getMaxOverflowValue()); |
| Assert.assertEquals(366, collector.getMaxOverflowRegister()); |
| Assert.assertEquals(1.0429189446653817E7, collector.estimateCardinality(), 1); |
| } |
| } |
| |
| // Provides a nice printout of error rates as a function of cardinality |
| @Ignore |
| @Test |
| public void showErrorRate() |
| { |
| HashFunction fn = Hashing.murmur3_128(); |
| Random random = ThreadLocalRandom.current(); |
| |
| double error = 0.0d; |
| int count = 0; |
| |
| final int[] valsToCheck = { |
| 10, 20, 50, 100, 1000, 2000, 5000, 10000, 20000, 50000, 100000, 1000000, 2000000, 10000000, Integer.MAX_VALUE |
| }; |
| |
| for (int numThings : valsToCheck) { |
| long startTime = System.currentTimeMillis(); |
| HyperLogLogCollector collector = HyperLogLogCollector.makeLatestCollector(); |
| |
| for (int i = 0; i < numThings; ++i) { |
| if (i != 0 && i % 100000000 == 0) { |
| ++count; |
| error = computeError(error, count, i, startTime, collector); |
| } |
| collector.add(fn.hashLong(random.nextLong()).asBytes()); |
| } |
| |
| ++count; |
| error = computeError(error, count, numThings, startTime, collector); |
| } |
| } |
| |
| private double computeError(double error, int count, int numThings, long startTime, HyperLogLogCollector collector) |
| { |
| final double estimatedValue = collector.estimateCardinality(); |
| final double errorThisTime = Math.abs((double) numThings - estimatedValue) / numThings; |
| |
| error += errorThisTime; |
| |
| log.info( |
| "%,d ==? %,f in %,d millis. actual error[%,f%%], avg. error [%,f%%]", |
| numThings, |
| estimatedValue, |
| System.currentTimeMillis() - startTime, |
| 100 * errorThisTime, |
| (error / count) * 100 |
| ); |
| return error; |
| } |
| } |