blob: 0d2dd8e19886d026dfde60fbdb52c45eeb6f812d [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.util.hll;
import org.apache.solr.SolrTestCase;
import org.apache.solr.util.LongIterator;
import org.junit.Test;
/**
* Tests {@link HLL} of type {@link HLLType#FULL}.
*/
public class FullHLLTest extends SolrTestCase {
// TODO union test
/**
* Smoke test for {@link HLL#cardinality()} and the proper use of the
* small range correction.
*/
@Test
public void smallRangeSmokeTest() {
final int log2m = 11;
final int m = (1 << log2m);
final int regwidth = 5;
// only one register set
{
final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary, unused*/, HLLType.FULL);
hll.addRaw(ProbabilisticTestUtil.constructHLLValue(log2m, 0/*ix*/, 1/*val*/));
final long cardinality = hll.cardinality();
// Trivially true that small correction conditions hold: one register
// set implies zeroes exist, and estimator trivially smaller than 5m/2.
// Small range correction: m * log(m/V)
final long expected = (long)Math.ceil(m * Math.log((double)m / (m - 1)/*# of zeroes*/));
assertEquals(cardinality, expected);
}
// all but one register set
{
final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary, unused*/, HLLType.FULL);
for(int i=0; i<(m - 1); i++) {
hll.addRaw(ProbabilisticTestUtil.constructHLLValue(log2m, i/*ix*/, 1/*val*/));
}
// Trivially true that small correction conditions hold: all but
// one register set implies a zero exists, and estimator trivially
// smaller than 5m/2 since it's alpha / ((m-1)/2)
final long cardinality = hll.cardinality();
// Small range correction: m * log(m/V)
final long expected = (long)Math.ceil(m * Math.log((double)m / 1/*# of zeroes*/));
assertEquals(cardinality, expected);
}
}
/**
* Smoke test for {@link HLL#cardinality()} and the proper use of the
* uncorrected estimator
*/
@Test
public void normalRangeSmokeTest() {
final int log2m = 11;
final int regwidth = 5;
// regwidth = 5, so hash space is
// log2m + (2^5 - 1 - 1), so L = log2m + 30
final int l = log2m + 30;
final int m = (1 << log2m);
final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary, unused*/, HLLType.FULL);
// all registers at 'medium' value
{
final int registerValue = 7/*chosen to ensure neither correction kicks in*/;
for(int i=0; i<m; i++) {
hll.addRaw(ProbabilisticTestUtil.constructHLLValue(log2m, i, registerValue));
}
final long cardinality = hll.cardinality();
// Simplified estimator when all registers take same value: alpha / (m/2^val)
final double estimator = HLLUtil.alphaMSquared(m)/((double)m/Math.pow(2, registerValue));
// Assert conditions for uncorrected range
assertTrue(estimator <= Math.pow(2, l)/30);
assertTrue(estimator > (5 * m /(double)2));
final long expected = (long)Math.ceil(estimator);
assertEquals(cardinality, expected);
}
}
/**
* Smoke test for {@link HLL#cardinality()} and the proper use of the large
* range correction.
*/
@Test
public void largeRangeSmokeTest() {
final int log2m = 12;
final int regwidth = 5;
// regwidth = 5, so hash space is
// log2m + (2^5 - 1 - 1), so L = log2m + 30
final int l = log2m + 30;
final int m = (1 << log2m);
final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary, unused*/, HLLType.FULL);
{
final int registerValue = 31/*chosen to ensure large correction kicks in*/;
for(int i=0; i<m; i++) {
hll.addRaw(ProbabilisticTestUtil.constructHLLValue(log2m, i, registerValue));
}
final long cardinality = hll.cardinality();
// Simplified estimator when all registers take same value: alpha / (m/2^val)
final double estimator = HLLUtil.alphaMSquared(m)/((double)m/Math.pow(2, registerValue));
// Assert conditions for large range
assertTrue(estimator > Math.pow(2,l)/30);
// Large range correction: -2^L * log(1 - E/2^L)
final long expected = (long)Math.ceil(-1.0 * Math.pow(2, l) * Math.log(1.0 - estimator/Math.pow(2, l)));
assertEquals(cardinality, expected);
}
}
// ========================================================================
/**
* Tests the bounds on a register's value for a given raw input value.
*/
@Test
public void registerValueTest() {
final int log2m = 4/*small enough to make testing easy (addRaw() shifts by one byte)*/;
// register width 4 (the minimum size)
{ // scoped locally for sanity
final int regwidth = 4;
final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary, unused*/, HLLType.FULL);
final BitVector bitVector = hll.probabilisticStorage;
// lower-bounds of the register
hll.addRaw(0x000000000000001L/*'j'=1*/);
assertEquals(bitVector.getRegister(1/*'j'*/), 0);
hll.addRaw(0x0000000000000012L/*'j'=2*/);
assertEquals(bitVector.getRegister(2/*'j'*/), 1);
hll.addRaw(0x0000000000000023L/*'j'=3*/);
assertEquals(bitVector.getRegister(3/*'j'*/), 2);
hll.addRaw(0x0000000000000044L/*'j'=4*/);
assertEquals(bitVector.getRegister(4/*'j'*/), 3);
hll.addRaw(0x0000000000000085L/*'j'=5*/);
assertEquals(bitVector.getRegister(5/*'j'*/), 4);
// upper-bounds of the register
// NOTE: bear in mind that BitVector itself does ensure that
// overflow of a register is prevented
hll.addRaw(0x0000000000010006L/*'j'=6*/);
assertEquals(bitVector.getRegister(6/*'j'*/), 13);
hll.addRaw(0x0000000000020007L/*'j'=7*/);
assertEquals(bitVector.getRegister(7/*'j'*/), 14);
hll.addRaw(0x0000000000040008L/*'j'=8*/);
assertEquals(bitVector.getRegister(8/*'j'*/), 15);
hll.addRaw(0x0000000000080009L/*'j'=9*/);
assertEquals(bitVector.getRegister(9/*'j'*/), 15/*overflow*/);
// sanity checks to ensure that no other bits above the lowest-set
// bit matters
// NOTE: same as case 'j = 6' above
hll.addRaw(0x000000000003000AL/*'j'=10*/);
assertEquals(bitVector.getRegister(10/*'j'*/), 13);
hll.addRaw(0x000000000011000BL/*'j'=11*/);
assertEquals(bitVector.getRegister(11/*'j'*/), 13);
}
// register width 5
{ // scoped locally for sanity
final int regwidth = 5;
final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary, unused*/, HLLType.FULL);
final BitVector bitVector = hll.probabilisticStorage;
// lower-bounds of the register
hll.addRaw(0x0000000000000001L/*'j'=1*/);
assertEquals(bitVector.getRegister(1/*'j'*/), 0);
hll.addRaw(0x0000000000000012L/*'j'=2*/);
assertEquals(bitVector.getRegister(2/*'j'*/), 1);
hll.addRaw(0x0000000000000023L/*'j'=3*/);
assertEquals(bitVector.getRegister(3/*'j'*/), 2);
hll.addRaw(0x0000000000000044L/*'j'=4*/);
assertEquals(bitVector.getRegister(4/*'j'*/), 3);
hll.addRaw(0x0000000000000085L/*'j'=5*/);
assertEquals(bitVector.getRegister(5/*'j'*/), 4);
// upper-bounds of the register
// NOTE: bear in mind that BitVector itself does ensure that
// overflow of a register is prevented
hll.addRaw(0x0000000100000006L/*'j'=6*/);
assertEquals(bitVector.getRegister(6/*'j'*/), 29);
hll.addRaw(0x0000000200000007L/*'j'=7*/);
assertEquals(bitVector.getRegister(7/*'j'*/), 30);
hll.addRaw(0x0000000400000008L/*'j'=8*/);
assertEquals(bitVector.getRegister(8/*'j'*/), 31);
hll.addRaw(0x0000000800000009L/*'j'=9*/);
assertEquals(bitVector.getRegister(9/*'j'*/), 31/*overflow*/);
}
}
// ========================================================================
/**
* Tests {@link HLL#clear()}.
*/
@Test
public void clearTest() {
final int regwidth = 5;
final int log2m = 4/*16 registers per counter*/;
final int m = 1 << log2m;
final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary, unused*/, HLLType.FULL);
final BitVector bitVector = hll.probabilisticStorage;
for(int i=0; i<m; i++)
bitVector.setRegister(i, i);
hll.clear();
for(int i=0; i<m; i++){
assertEquals(bitVector.getRegister(i), 0L/*default value of register*/);
}
}
// ========================================================================
// Serialization
/**
* Tests {@link HLL#toBytes(ISchemaVersion)} and {@link HLL#fromBytes(byte[])}.
*/
@Test
public void toFromBytesTest() {
final int log2m = 11/*arbitrary*/;
final int regwidth = 5;
final ISchemaVersion schemaVersion = SerializationUtil.DEFAULT_SCHEMA_VERSION;
final HLLType type = HLLType.FULL;
final int padding = schemaVersion.paddingBytes(type);
final int dataByteCount = ProbabilisticTestUtil.getRequiredBytes(regwidth, (1 << log2m)/*aka 2^log2m = m*/);
final int expectedByteCount = padding + dataByteCount;
{// Should work on an empty element
final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary, unused*/, HLLType.FULL);
final byte[] bytes = hll.toBytes(schemaVersion);
// assert output length is correct
assertEquals(bytes.length, expectedByteCount);
final HLL inHLL = HLL.fromBytes(bytes);
// assert register values correct
assertElementsEqual(hll, inHLL);
}
{// Should work on a partially filled element
final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary, unused*/, HLLType.FULL);
for(int i=0; i<3; i++) {
final long rawValue = ProbabilisticTestUtil.constructHLLValue(log2m, i, (i+9));
hll.addRaw(rawValue);
}
final byte[] bytes = hll.toBytes(schemaVersion);
// assert output length is correct
assertEquals(bytes.length, expectedByteCount);
final HLL inHLL = HLL.fromBytes(bytes);
// assert register values correct
assertElementsEqual(hll, inHLL);
}
{// Should work on a full set
final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary, unused*/, HLLType.FULL);
for(int i=0; i<(1 << log2m)/*aka 2^log2m*/; i++) {
final long rawValue = ProbabilisticTestUtil.constructHLLValue(log2m, i, (i % 9) + 1);
hll.addRaw(rawValue);
}
final byte[] bytes = hll.toBytes(schemaVersion);
// assert output length is correct
assertEquals(bytes.length, expectedByteCount);
final HLL inHLL = HLL.fromBytes(bytes);
// assert register values correct
assertElementsEqual(hll, inHLL);
}
}
// ************************************************************************
// Assertion Helpers
/**
* Asserts that the two HLLs are register-wise equal.
*/
private static void assertElementsEqual(final HLL hllA, final HLL hllB) {
final BitVector bitVectorA = hllA.probabilisticStorage;
final BitVector bitVectorB = hllB.probabilisticStorage;
final LongIterator iterA = bitVectorA.registerIterator();
final LongIterator iterB = bitVectorB.registerIterator();
for(;iterA.hasNext() && iterB.hasNext();) {
assertEquals(iterA.next(), iterB.next());
}
assertFalse(iterA.hasNext());
assertFalse(iterB.hasNext());
}
}