Merging with trunk.
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/solr7787@1691514 13f79535-47bb-0310-9956-ffa450edef68
diff --git a/lucene/ivy-versions.properties b/lucene/ivy-versions.properties
index 90d27ae..60aba7e 100644
--- a/lucene/ivy-versions.properties
+++ b/lucene/ivy-versions.properties
@@ -73,7 +73,6 @@
/hsqldb/hsqldb = 1.8.0.10
/io.airlift/slice = 0.10
/io.netty/netty = 3.7.0.Final
-/it.unimi.dsi/fastutil = 6.5.11
/jakarta-regexp/jakarta-regexp = 1.4
/javax.activation/activation = 1.1.1
/javax.inject/javax.inject= 1
@@ -85,7 +84,6 @@
/log4j/log4j = 1.2.17
/mecab/mecab-ipadic = 2.7.0-20070801
/mecab/mecab-naist-jdic = 0.6.3b-20111013
-/net.agkn/hll = 1.6.0
/net.arnx/jsonic = 1.2.7
/net.sf.ehcache/ehcache-core = 2.4.4
/net.sf.saxon/Saxon-HE = 9.6.0-2
diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index 2faa3c5..a1cc649 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -260,6 +260,9 @@
Other Changes
----------------------
+* SOLR-7787: Removed fastutil and java-hll dependency, integrated HyperLogLog from java-hll
+ into Solr core. (Dawid Weiss)
+
* SOLR-7595: Allow method chaining for all CollectionAdminRequests in Solrj. (shalin)
* SOLR-7146: MiniSolrCloudCluster based tests can fail with ZooKeeperException NoNode for /live_nodes.
diff --git a/solr/NOTICE.txt b/solr/NOTICE.txt
index a0eefed..25e2aa3 100644
--- a/solr/NOTICE.txt
+++ b/solr/NOTICE.txt
@@ -13,6 +13,9 @@
- Apache Blur
- Apache Hadoop
+This product includes code forked from the Java-HLL library.
+Copyright (c) 2013 Aggregate Knowledge, Inc., https://github.com/aggregateknowledge/java-hll/
+
This product includes the JQuery JavaScript library created by John Resig.
Copyright (c) 2010 John Resig, http://jquery.com/
diff --git a/solr/core/ivy.xml b/solr/core/ivy.xml
index 912d5c2..1495e46 100644
--- a/solr/core/ivy.xml
+++ b/solr/core/ivy.xml
@@ -134,10 +134,6 @@
<dependency org="org.antlr" name="antlr4-runtime" rev="${/org.antlr/antlr4-runtime}"/>
<dependency org="io.airlift" name="slice" rev="${/io.airlift/slice}"/>
- <!-- StatsComponents HLL Dependencies-->
- <dependency org="net.agkn" name="hll" rev="${/net.agkn/hll}" conf="compile->*"/>
- <dependency org="it.unimi.dsi" name="fastutil" rev="${/it.unimi.dsi/fastutil}" conf="compile->*"/>
-
<exclude org="*" ext="*" matcher="regexp" type="${ivy.exclude.types}"/>
</dependencies>
</ivy-module>
diff --git a/solr/core/src/java/org/apache/solr/handler/component/StatsField.java b/solr/core/src/java/org/apache/solr/handler/component/StatsField.java
index 9a26dc1..c5f70f4 100644
--- a/solr/core/src/java/org/apache/solr/handler/component/StatsField.java
+++ b/solr/core/src/java/org/apache/solr/handler/component/StatsField.java
@@ -55,9 +55,9 @@
import org.apache.solr.search.QueryParsing;
import org.apache.solr.search.SolrIndexSearcher;
import org.apache.solr.search.SyntaxError;
+import org.apache.solr.util.hll.HLL;
+import org.apache.solr.util.hll.HLLType;
-import net.agkn.hll.HLL;
-import net.agkn.hll.HLLType;
import com.google.common.hash.Hashing;
import com.google.common.hash.HashFunction;
@@ -625,8 +625,8 @@
* Creates an HllOptions based on the (local) params specified (if appropriate).
*
* @param localParams the LocalParams for this {@link StatsField}
- * @param field the field corrisponding to this {@link StatsField}, may be null if these stats are over a value source
- * @return the {@link HllOptions} to use basd on the params, or null if no {@link HLL} should be computed
+ * @param field the field corresponding to this {@link StatsField}, may be null if these stats are over a value source
+ * @return the {@link HllOptions} to use based on the params, or null if no {@link HLL} should be computed
* @throws SolrException if there are invalid options
*/
public static HllOptions parseHllOptions(SolrParams localParams, SchemaField field)
diff --git a/solr/core/src/java/org/apache/solr/handler/component/StatsValuesFactory.java b/solr/core/src/java/org/apache/solr/handler/component/StatsValuesFactory.java
index 6005f40..1a53c71 100644
--- a/solr/core/src/java/org/apache/solr/handler/component/StatsValuesFactory.java
+++ b/solr/core/src/java/org/apache/solr/handler/component/StatsValuesFactory.java
@@ -33,12 +33,12 @@
import org.apache.solr.schema.*;
import com.tdunning.math.stats.AVLTreeDigest;
-
-import net.agkn.hll.HLL;
-import net.agkn.hll.HLLType;
import com.google.common.hash.Hashing;
import com.google.common.hash.HashFunction;
+import org.apache.solr.util.hll.HLL;
+import org.apache.solr.util.hll.HLLType;
+
/**
* Factory class for creating instance of
* {@link org.apache.solr.handler.component.StatsValues}
diff --git a/solr/core/src/java/org/apache/solr/search/facet/HLLAgg.java b/solr/core/src/java/org/apache/solr/search/facet/HLLAgg.java
index 758edb0..91a2133 100644
--- a/solr/core/src/java/org/apache/solr/search/facet/HLLAgg.java
+++ b/solr/core/src/java/org/apache/solr/search/facet/HLLAgg.java
@@ -23,8 +23,8 @@
import java.util.List;
import java.util.Set;
-import net.agkn.hll.HLL;
-import net.agkn.hll.HLLType;
+import org.apache.solr.util.hll.HLL;
+import org.apache.solr.util.hll.HLLType;
import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.NumericDocValues;
diff --git a/solr/core/src/java/org/apache/solr/search/facet/UniqueSlotAcc.java b/solr/core/src/java/org/apache/solr/search/facet/UniqueSlotAcc.java
index b3f215b..ca7d9b8 100644
--- a/solr/core/src/java/org/apache/solr/search/facet/UniqueSlotAcc.java
+++ b/solr/core/src/java/org/apache/solr/search/facet/UniqueSlotAcc.java
@@ -21,7 +21,7 @@
import java.util.ArrayList;
import java.util.List;
-import net.agkn.hll.HLL;
+import org.apache.solr.util.hll.HLL;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.MultiDocValues;
import org.apache.lucene.index.SortedDocValues;
diff --git a/solr/core/src/java/org/apache/solr/util/hll/BigEndianAscendingWordDeserializer.java b/solr/core/src/java/org/apache/solr/util/hll/BigEndianAscendingWordDeserializer.java
new file mode 100644
index 0000000..3245d1b
--- /dev/null
+++ b/solr/core/src/java/org/apache/solr/util/hll/BigEndianAscendingWordDeserializer.java
@@ -0,0 +1,173 @@
+package org.apache.solr.util.hll;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * A corresponding deserializer for {@link BigEndianAscendingWordSerializer}.
+ */
+class BigEndianAscendingWordDeserializer implements IWordDeserializer {
+ // The number of bits per byte.
+ private static final int BITS_PER_BYTE = 8;
+
+ // long mask for the maximum value stored in a byte
+ private static final long BYTE_MASK = (1L << BITS_PER_BYTE) - 1L;
+
+ // ************************************************************************
+ // The length in bits of the words to be read.
+ private final int wordLength;
+
+ // The byte array to which the words are serialized.
+ private final byte[] bytes;
+
+ // The number of leading padding bytes in 'bytes' to be ignored.
+ private final int bytePadding;
+
+ // The number of words that the byte array contains.
+ private final int wordCount;
+
+ // The current read state.
+ private int currentWordIndex;
+
+ // ========================================================================
+ /**
+ * @param wordLength the length in bits of the words to be deserialized. Must
+ * be less than or equal to 64 and greater than or equal to 1.
+ * @param bytePadding the number of leading bytes that pad the serialized words.
+ * Must be greater than or equal to zero.
+ * @param bytes the byte array containing the serialized words. Cannot be
+ * <code>null</code>.
+ */
+ public BigEndianAscendingWordDeserializer(final int wordLength, final int bytePadding, final byte[] bytes) {
+ if((wordLength < 1) || (wordLength > 64)) {
+ throw new IllegalArgumentException("Word length must be >= 1 and <= 64. (was: " + wordLength + ")");
+ }
+
+ if(bytePadding < 0) {
+ throw new IllegalArgumentException("Byte padding must be >= zero. (was: " + bytePadding + ")");
+ }
+
+ this.wordLength = wordLength;
+ this.bytes = bytes;
+ this.bytePadding = bytePadding;
+
+ final int dataBytes = (bytes.length - bytePadding);
+ final long dataBits = (dataBytes * BITS_PER_BYTE);
+
+ this.wordCount = (int)(dataBits/wordLength);
+
+ currentWordIndex = 0;
+ }
+
+ // ========================================================================
+ /* (non-Javadoc)
+ * @see net.agkn.hll.serialization.IWordDeserializer#readWord()
+ */
+ @Override
+ public long readWord() {
+ final long word = readWord(currentWordIndex);
+ currentWordIndex++;
+
+ return word;
+ }
+
+ // ------------------------------------------------------------------------
+ /**
+ * Reads the word at the specified sequence position (zero-indexed).
+ *
+ * @param position the zero-indexed position of the word to be read. This
+ * must be greater than or equal to zero.
+ * @return the value of the serialized word at the specified position.
+ */
+ private long readWord(final int position) {
+ if(position < 0) {
+ throw new ArrayIndexOutOfBoundsException(position);
+ }
+
+ // First bit of the word
+ final long firstBitIndex = (position * wordLength);
+ final int firstByteIndex = (bytePadding + (int)(firstBitIndex / BITS_PER_BYTE));
+ final int firstByteSkipBits = (int)(firstBitIndex % BITS_PER_BYTE);
+
+ // Last bit of the word
+ final long lastBitIndex = (firstBitIndex + wordLength - 1);
+ final int lastByteIndex = (bytePadding + (int)(lastBitIndex / BITS_PER_BYTE));
+ final int lastByteBitsToConsume;
+
+ final int bitsAfterByteBoundary = (int)((lastBitIndex + 1) % BITS_PER_BYTE);
+ // If the word terminates at the end of the last byte, consume the whole
+ // last byte.
+ if(bitsAfterByteBoundary == 0) {
+ lastByteBitsToConsume = BITS_PER_BYTE;
+ } else {
+ // Otherwise, only consume what is necessary.
+ lastByteBitsToConsume = bitsAfterByteBoundary;
+ }
+
+ if(lastByteIndex >= bytes.length) {
+ throw new ArrayIndexOutOfBoundsException("Word out of bounds of backing array.");
+ }
+
+ // Accumulator
+ long value = 0;
+
+ // --------------------------------------------------------------------
+ // First byte
+ final int bitsRemainingInFirstByte = (BITS_PER_BYTE - firstByteSkipBits);
+ final int bitsToConsumeInFirstByte = Math.min(bitsRemainingInFirstByte, wordLength);
+ long firstByte = (long)bytes[firstByteIndex];
+
+ // Mask off the bits to skip in the first byte.
+ final long firstByteMask = ((1L << bitsRemainingInFirstByte) - 1L);
+ firstByte &= firstByteMask;
+ // Right-align relevant bits of first byte.
+ firstByte >>>= (bitsRemainingInFirstByte - bitsToConsumeInFirstByte);
+
+ value |= firstByte;
+
+ // If the first byte contains the whole word, short-circuit.
+ if(firstByteIndex == lastByteIndex) {
+ return value;
+ }
+
+ // --------------------------------------------------------------------
+ // Middle bytes
+ final int middleByteCount = (lastByteIndex - firstByteIndex - 1);
+ for(int i=0; i<middleByteCount; i++) {
+ final long middleByte = (bytes[firstByteIndex + i + 1] & BYTE_MASK);
+ // Push middle byte onto accumulator.
+ value <<= BITS_PER_BYTE;
+ value |= middleByte;
+ }
+
+ // --------------------------------------------------------------------
+ // Last byte
+ long lastByte = (bytes[lastByteIndex] & BYTE_MASK);
+ lastByte >>= (BITS_PER_BYTE - lastByteBitsToConsume);
+ value <<= lastByteBitsToConsume;
+ value |= lastByte;
+ return value;
+ }
+
+ /* (non-Javadoc)
+ * @see net.agkn.hll.serialization.IWordDeserializer#totalWordCount()
+ */
+ @Override
+ public int totalWordCount() {
+ return wordCount;
+ }
+}
diff --git a/solr/core/src/java/org/apache/solr/util/hll/BigEndianAscendingWordSerializer.java b/solr/core/src/java/org/apache/solr/util/hll/BigEndianAscendingWordSerializer.java
new file mode 100644
index 0000000..6bf46fc
--- /dev/null
+++ b/solr/core/src/java/org/apache/solr/util/hll/BigEndianAscendingWordSerializer.java
@@ -0,0 +1,174 @@
+package org.apache.solr.util.hll;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * A serializer that writes a sequence of fixed bit-width 'words' to a byte array.
+ * Bitwise OR is used to write words into bytes, so a low bit in a word is also
+ * a low bit in a byte. However, a high byte in a word is written at a lower index
+ * in the array than a low byte in a word. The first word is written at the lowest
+ * array index. Each serializer is one time use and returns its backing byte
+ * array.<p/>
+ *
+ * This encoding was chosen so that when reading bytes as octets in the typical
+ * first-octet-is-the-high-nibble fashion, an octet-to-binary conversion
+ * would yield a high-to-low, left-to-right view of the "short words".<p/>
+ *
+ * Example:<p/>
+ *
+ * Say short words are 5 bits wide. Our word sequence is the values
+ * <code>[31, 1, 5]</code>. In big-endian binary format, the values are
+ * <code>[0b11111, 0b00001, 0b00101]</code>. We use 15 of 16 bits in two bytes
+ * and pad the last (lowest) bit of the last byte with a zero:
+ *
+ * <code>
+ * [0b11111000, 0b01001010] = [0xF8, 0x4A]
+ * </code>.
+ */
+class BigEndianAscendingWordSerializer implements IWordSerializer {
+ // The number of bits per byte.
+ private static final int BITS_PER_BYTE = 8;
+
+ // ************************************************************************
+ // The length in bits of the words to be written.
+ private final int wordLength;
+ // The number of words to be written.
+ private final int wordCount;
+
+ // The byte array to which the words are serialized.
+ private final byte[] bytes;
+
+ // ------------------------------------------------------------------------
+ // Write state
+ // Number of bits that remain writable in the current byte.
+ private int bitsLeftInByte;
+ // Index of byte currently being written to.
+ private int byteIndex;
+ // Number of words written.
+ private int wordsWritten;
+
+ // ========================================================================
+ /**
+ * @param wordLength the length in bits of the words to be serialized. Must
+ * be greater than or equal to 1 and less than or equal to 64.
+ * @param wordCount the number of words to be serialized. Must be greater than
+ * or equal to zero.
+ * @param bytePadding the number of leading bytes that should pad the
+ * serialized words. Must be greater than or equal to zero.
+ */
+ public BigEndianAscendingWordSerializer(final int wordLength, final int wordCount, final int bytePadding) {
+ if((wordLength < 1) || (wordLength > 64)) {
+ throw new IllegalArgumentException("Word length must be >= 1 and <= 64. (was: " + wordLength + ")");
+ }
+ if(wordCount < 0) {
+ throw new IllegalArgumentException("Word count must be >= 0. (was: " + wordCount + ")");
+ }
+ if(bytePadding < 0) {
+ throw new IllegalArgumentException("Byte padding must be must be >= 0. (was: " + bytePadding + ")");
+ }
+
+ this.wordLength = wordLength;
+ this.wordCount = wordCount;
+
+ final long bitsRequired = (wordLength * wordCount);
+ final boolean leftoverBits = ((bitsRequired % BITS_PER_BYTE) != 0);
+ final int bytesRequired = (int)(bitsRequired / BITS_PER_BYTE) + (leftoverBits ? 1 : 0) + bytePadding;
+ bytes = new byte[bytesRequired];
+
+ bitsLeftInByte = BITS_PER_BYTE;
+ byteIndex = bytePadding;
+ wordsWritten = 0;
+ }
+
+ /* (non-Javadoc)
+ * @see net.agkn.hll.serialization.IWordSerializer#writeWord(long)
+ * @throws RuntimeException if the number of words written is greater than the
+ * <code>wordCount</code> parameter in the constructor.
+ */
+ @Override
+ public void writeWord(final long word) {
+ if(wordsWritten == wordCount) {
+ throw new RuntimeException("Cannot write more words, backing array full!");
+ }
+
+ int bitsLeftInWord = wordLength;
+
+ while(bitsLeftInWord > 0) {
+ // Move to the next byte if the current one is fully packed.
+ if(bitsLeftInByte == 0) {
+ byteIndex++;
+ bitsLeftInByte = BITS_PER_BYTE;
+ }
+
+ final long consumedMask;
+ if(bitsLeftInWord == 64) {
+ consumedMask = ~0L;
+ } else {
+ consumedMask = ((1L << bitsLeftInWord) - 1L);
+ }
+
+ // Fix how many bits will be written in this cycle. Choose the
+ // smaller of the remaining bits in the word or byte.
+ final int numberOfBitsToWrite = Math.min(bitsLeftInByte, bitsLeftInWord);
+ final int bitsInByteRemainingAfterWrite = (bitsLeftInByte - numberOfBitsToWrite);
+
+ // In general, we write the highest bits of the word first, so we
+ // strip the highest bits that were consumed in previous cycles.
+ final long remainingBitsOfWordToWrite = (word & consumedMask);
+
+ final long bitsThatTheByteCanAccept;
+ // If there is more left in the word than can be written to this
+ // byte, shift off the bits that can't be written off the bottom.
+ if(bitsLeftInWord > numberOfBitsToWrite) {
+ bitsThatTheByteCanAccept = (remainingBitsOfWordToWrite >>> (bitsLeftInWord - bitsLeftInByte));
+ } else {
+ // If the byte can accept all remaining bits, there is no need
+ // to shift off the bits that won't be written in this cycle.
+ bitsThatTheByteCanAccept = remainingBitsOfWordToWrite;
+ }
+
+ // Align the word bits to write up against the byte bits that have
+ // already been written. This shift may do nothing if the remainder
+ // of the byte is being consumed in this cycle.
+ final long alignedBits = (bitsThatTheByteCanAccept << bitsInByteRemainingAfterWrite);
+
+ // Update the byte with the alignedBits.
+ bytes[byteIndex] |= (byte)alignedBits;
+
+ // Update state with bit count written.
+ bitsLeftInWord -= numberOfBitsToWrite;
+ bitsLeftInByte = bitsInByteRemainingAfterWrite;
+ }
+
+ wordsWritten ++;
+ }
+
+ /* (non-Javadoc)
+ * @see net.agkn.hll.serialization.IWordSerializer#getBytes()
+ * @throws RuntimeException if the number of words written is fewer than the
+ * <code>wordCount</code> parameter in the constructor.
+ */
+ @Override
+ public byte[] getBytes() {
+ if(wordsWritten < wordCount) {
+ throw new RuntimeException("Not all words have been written! (" + wordsWritten + "/" + wordCount + ")");
+ }
+
+ return bytes;
+ }
+}
diff --git a/solr/core/src/java/org/apache/solr/util/hll/BitUtil.java b/solr/core/src/java/org/apache/solr/util/hll/BitUtil.java
new file mode 100644
index 0000000..275d7a1
--- /dev/null
+++ b/solr/core/src/java/org/apache/solr/util/hll/BitUtil.java
@@ -0,0 +1,71 @@
+package org.apache.solr.util.hll;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * A collection of bit utilities.
+ */
+class BitUtil {
+ /**
+ * The set of least-significant bits for a given <code>byte</code>. <code>-1</code>
+ * is used if no bits are set (so as to not be confused with "index of zero"
+ * meaning that the least significant bit is the 0th (1st) bit).
+ *
+ * @see #leastSignificantBit(long)
+ */
+ private static final int[] LEAST_SIGNIFICANT_BIT = {
+ -1, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
+ 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
+ 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
+ 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
+ 6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
+ 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
+ 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
+ 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
+ 7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
+ 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
+ 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
+ 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
+ 6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
+ 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
+ 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
+ 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0
+ };
+
+ /**
+ * Computes the least-significant bit of the specified <code>long</code>
+ * that is set to <code>1</code>. Zero-indexed.
+ *
+ * @param value the <code>long</code> whose least-significant bit is desired.
+ * @return the least-significant bit of the specified <code>long</code>.
+ * <code>-1</code> is returned if there are no bits set.
+ */
+ // REF: http://stackoverflow.com/questions/757059/position-of-least-significant-bit-that-is-set
+ // REF: http://www-graphics.stanford.edu/~seander/bithacks.html
+ public static int leastSignificantBit(final long value) {
+ if(value == 0L) return -1/*by contract*/;
+ if((value & 0xFFL) != 0) return LEAST_SIGNIFICANT_BIT[(int)( (value >>> 0) & 0xFF)] + 0;
+ if((value & 0xFFFFL) != 0) return LEAST_SIGNIFICANT_BIT[(int)( (value >>> 8) & 0xFF)] + 8;
+ if((value & 0xFFFFFFL) != 0) return LEAST_SIGNIFICANT_BIT[(int)( (value >>> 16) & 0xFF)] + 16;
+ if((value & 0xFFFFFFFFL) != 0) return LEAST_SIGNIFICANT_BIT[(int)( (value >>> 24) & 0xFF)] + 24;
+ if((value & 0xFFFFFFFFFFL) != 0) return LEAST_SIGNIFICANT_BIT[(int)( (value >>> 32) & 0xFF)] + 32;
+ if((value & 0xFFFFFFFFFFFFL) != 0) return LEAST_SIGNIFICANT_BIT[(int)( (value >>> 40) & 0xFF)] + 40;
+ if((value & 0xFFFFFFFFFFFFFFL) != 0) return LEAST_SIGNIFICANT_BIT[(int)( (value >>> 48) & 0xFF)] + 48;
+ return LEAST_SIGNIFICANT_BIT[(int)( (value >>> 56) & 0xFFL)] + 56;
+ }
+}
\ No newline at end of file
diff --git a/solr/core/src/java/org/apache/solr/util/hll/BitVector.java b/solr/core/src/java/org/apache/solr/util/hll/BitVector.java
new file mode 100644
index 0000000..cd18570
--- /dev/null
+++ b/solr/core/src/java/org/apache/solr/util/hll/BitVector.java
@@ -0,0 +1,259 @@
+package org.apache.solr.util.hll;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * A vector (array) of bits that is accessed in units ("registers") of <code>width</code>
+ * bits which are stored as 64bit "words" (<code>long</code>s). In this context
+ * a register is at most 64bits.
+ */
+class BitVector implements Cloneable {
+ // NOTE: in this context, a word is 64bits
+
+ // rather than doing division to determine how a bit index fits into 64bit
+ // words (i.e. longs), bit shifting is used
+ private static final int LOG2_BITS_PER_WORD = 6/*=>64bits*/;
+ private static final int BITS_PER_WORD = 1 << LOG2_BITS_PER_WORD;
+ private static final int BITS_PER_WORD_MASK = BITS_PER_WORD - 1;
+
+ // ditto from above but for bytes (for output)
+ private static final int LOG2_BITS_PER_BYTE = 3/*=>8bits*/;
+ public static final int BITS_PER_BYTE = 1 << LOG2_BITS_PER_BYTE;
+
+ // ========================================================================
+ public static final int BYTES_PER_WORD = 8/*8 bytes in a long*/;
+
+ // ************************************************************************
+ // 64bit words
+ private final long[] words;
+ public final long[] words() { return words; }
+ public final int wordCount() { return words.length; }
+ public final int byteCount() { return wordCount() * BYTES_PER_WORD; }
+
+ // the width of a register in bits (this cannot be more than 64 (the word size))
+ private final int registerWidth;
+ public final int registerWidth() { return registerWidth; }
+
+ private final long count;
+
+ // ------------------------------------------------------------------------
+ private final long registerMask;
+
+ // ========================================================================
+ /**
+ * @param width the width of each register. This cannot be negative or
+ * zero or greater than 63 (the signed word size).
+ * @param count the number of registers. This cannot be negative or zero
+ */
+ public BitVector(final int width, final long count) {
+ // ceil((width * count)/BITS_PER_WORD)
+ this.words = new long[(int)(((width * count) + BITS_PER_WORD_MASK) >>> LOG2_BITS_PER_WORD)];
+ this.registerWidth = width;
+ this.count = count;
+
+ this.registerMask = (1L << width) - 1;
+ }
+
+ // ========================================================================
+ /**
+ * @param registerIndex the index of the register whose value is to be
+ * retrieved. This cannot be negative.
+ * @return the value at the specified register index
+ * @see #setRegister(long, long)
+ * @see #setMaxRegister(long, long)
+ */
+ // NOTE: if this changes then setMaxRegister() must change
+ public long getRegister(final long registerIndex) {
+ final long bitIndex = registerIndex * registerWidth;
+ final int firstWordIndex = (int)(bitIndex >>> LOG2_BITS_PER_WORD)/*aka (bitIndex / BITS_PER_WORD)*/;
+ final int secondWordIndex = (int)((bitIndex + registerWidth - 1) >>> LOG2_BITS_PER_WORD)/*see above*/;
+ final int bitRemainder = (int)(bitIndex & BITS_PER_WORD_MASK)/*aka (bitIndex % BITS_PER_WORD)*/;
+
+ if(firstWordIndex == secondWordIndex)
+ return ((words[firstWordIndex] >>> bitRemainder) & registerMask);
+ /* else -- register spans words */
+ return (words[firstWordIndex] >>> bitRemainder)/*no need to mask since at top of word*/
+ | (words[secondWordIndex] << (BITS_PER_WORD - bitRemainder)) & registerMask;
+ }
+
+ /**
+ * @param registerIndex the index of the register whose value is to be set.
+ * This cannot be negative
+ * @param value the value to set in the register
+ * @see #getRegister(long)
+ * @see #setMaxRegister(long, long)
+ */
+ // NOTE: if this changes then setMaxRegister() must change
+ public void setRegister(final long registerIndex, final long value) {
+ final long bitIndex = registerIndex * registerWidth;
+ final int firstWordIndex = (int)(bitIndex >>> LOG2_BITS_PER_WORD)/*aka (bitIndex / BITS_PER_WORD)*/;
+ final int secondWordIndex = (int)((bitIndex + registerWidth - 1) >>> LOG2_BITS_PER_WORD)/*see above*/;
+ final int bitRemainder = (int)(bitIndex & BITS_PER_WORD_MASK)/*aka (bitIndex % BITS_PER_WORD)*/;
+
+ final long words[] = this.words/*for convenience/performance*/;
+ if(firstWordIndex == secondWordIndex) {
+ // clear then set
+ words[firstWordIndex] &= ~(registerMask << bitRemainder);
+ words[firstWordIndex] |= (value << bitRemainder);
+ } else {/*register spans words*/
+ // clear then set each partial word
+ words[firstWordIndex] &= (1L << bitRemainder) - 1;
+ words[firstWordIndex] |= (value << bitRemainder);
+
+ words[secondWordIndex] &= ~(registerMask >>> (BITS_PER_WORD - bitRemainder));
+ words[secondWordIndex] |= (value >>> (BITS_PER_WORD - bitRemainder));
+ }
+ }
+
+ // ------------------------------------------------------------------------
+ /**
+ * @return a <code>LongIterator</code> for iterating starting at the register
+ * with index zero. This will never be <code>null</code>.
+ */
+ public LongIterator registerIterator() {
+ return new LongIterator() {
+ final int registerWidth = BitVector.this.registerWidth;
+ final long[] words = BitVector.this.words;
+ final long registerMask = BitVector.this.registerMask;
+
+ // register setup
+ long registerIndex = 0;
+ int wordIndex = 0;
+ int remainingWordBits = BITS_PER_WORD;
+ long word = words[wordIndex];
+
+ @Override public long next() {
+ long register;
+ if(remainingWordBits >= registerWidth) {
+ register = word & registerMask;
+
+ // shift to the next register
+ word >>>= registerWidth;
+ remainingWordBits -= registerWidth;
+ } else { /*insufficient bits remaining in current word*/
+ wordIndex++/*move to the next word*/;
+
+ register = (word | (words[wordIndex] << remainingWordBits)) & registerMask;
+
+ // shift to the next partial register (word)
+ word = words[wordIndex] >>> (registerWidth - remainingWordBits);
+ remainingWordBits += BITS_PER_WORD - registerWidth;
+ }
+ registerIndex++;
+ return register;
+ }
+
+ @Override public boolean hasNext() {
+ return registerIndex < count;
+ }
+ };
+ }
+
+ // ------------------------------------------------------------------------
+ // composite accessors
+ /**
+ * Sets the value of the specified index register if and only if the specified
+ * value is greater than the current value in the register. This is equivalent
+ * to but much more performant than:<p/>
+ *
+ * <pre>vector.setRegister(index, Math.max(vector.getRegister(index), value));</pre>
+ *
+ * @param registerIndex the index of the register whose value is to be set.
+ * This cannot be negative
+ * @param value the value to set in the register if and only if this value
+ * is greater than the current value in the register
+ * @return <code>true</code> if and only if the specified value is greater
+ * than or equal to the current register value. <code>false</code>
+ * otherwise.
+ * @see #getRegister(long)
+ * @see #setRegister(long, long)
+ * @see java.lang.Math#max(long, long)
+ */
+ // NOTE: if this changes then setRegister() must change
+ public boolean setMaxRegister(final long registerIndex, final long value) {
+ final long bitIndex = registerIndex * registerWidth;
+ final int firstWordIndex = (int)(bitIndex >>> LOG2_BITS_PER_WORD)/*aka (bitIndex / BITS_PER_WORD)*/;
+ final int secondWordIndex = (int)((bitIndex + registerWidth - 1) >>> LOG2_BITS_PER_WORD)/*see above*/;
+ final int bitRemainder = (int)(bitIndex & BITS_PER_WORD_MASK)/*aka (bitIndex % BITS_PER_WORD)*/;
+
+ // NOTE: matches getRegister()
+ final long registerValue;
+ final long words[] = this.words/*for convenience/performance*/;
+ if(firstWordIndex == secondWordIndex)
+ registerValue = ((words[firstWordIndex] >>> bitRemainder) & registerMask);
+ else /*register spans words*/
+ registerValue = (words[firstWordIndex] >>> bitRemainder)/*no need to mask since at top of word*/
+ | (words[secondWordIndex] << (BITS_PER_WORD - bitRemainder)) & registerMask;
+
+ // determine which is the larger and update as necessary
+ if(value > registerValue) {
+ // NOTE: matches setRegister()
+ if(firstWordIndex == secondWordIndex) {
+ // clear then set
+ words[firstWordIndex] &= ~(registerMask << bitRemainder);
+ words[firstWordIndex] |= (value << bitRemainder);
+ } else {/*register spans words*/
+ // clear then set each partial word
+ words[firstWordIndex] &= (1L << bitRemainder) - 1;
+ words[firstWordIndex] |= (value << bitRemainder);
+
+ words[secondWordIndex] &= ~(registerMask >>> (BITS_PER_WORD - bitRemainder));
+ words[secondWordIndex] |= (value >>> (BITS_PER_WORD - bitRemainder));
+ }
+ } /* else -- the register value is greater (or equal) so nothing needs to be done */
+
+ return (value >= registerValue);
+ }
+
+ // ========================================================================
+ /**
+ * Fills this bit vector with the specified bit value. This can be used to
+ * clear the vector by specifying <code>0</code>.
+ *
+ * @param value the value to set all bits to (only the lowest bit is used)
+ */
+ public void fill(final long value) {
+ for(long i=0; i<count; i++) {
+ setRegister(i, value);
+ }
+ }
+
+ // ------------------------------------------------------------------------
+ /**
+ * Serializes the registers of the vector using the specified serializer.
+ *
+ * @param serializer the serializer to use. This cannot be <code>null</code>.
+ */
+ public void getRegisterContents(final IWordSerializer serializer) {
+ for(final LongIterator iter = registerIterator(); iter.hasNext();) {
+ serializer.writeWord(iter.next());
+ }
+ }
+
+ /**
+ * Creates a deep copy of this vector.
+ *
+ * @see java.lang.Object#clone()
+ */
+ @Override
+ public BitVector clone() {
+ final BitVector copy = new BitVector(registerWidth, count);
+ System.arraycopy(words, 0, copy.words, 0, words.length);
+ return copy;
+ }
+}
\ No newline at end of file
diff --git a/solr/core/src/java/org/apache/solr/util/hll/HLL.java b/solr/core/src/java/org/apache/solr/util/hll/HLL.java
new file mode 100644
index 0000000..432401b
--- /dev/null
+++ b/solr/core/src/java/org/apache/solr/util/hll/HLL.java
@@ -0,0 +1,1071 @@
+package org.apache.solr.util.hll;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.Arrays;
+
+import com.carrotsearch.hppc.IntByteOpenHashMap;
+import com.carrotsearch.hppc.LongOpenHashSet;
+import com.carrotsearch.hppc.cursors.IntByteCursor;
+import com.carrotsearch.hppc.cursors.LongCursor;
+
+/**
+ * A probabilistic set of hashed <code>long</code> elements. Useful for computing
+ * the approximate cardinality of a stream of data in very small storage.
+ *
+ * A modified version of the <a href="http://algo.inria.fr/flajolet/Publications/FlFuGaMe07.pdf">
+ * 'HyperLogLog' data structure and algorithm</a> is used, which combines both
+ * probabilistic and non-probabilistic techniques to improve the accuracy and
+ * storage requirements of the original algorithm.
+ *
+ * More specifically, initializing and storing a new {@link HLL} will
+ * allocate a sentinel value symbolizing the empty set ({@link HLLType#EMPTY}).
+ * After adding the first few values, a sorted list of unique integers is
+ * stored in a {@link HLLType#EXPLICIT} hash set. When configured, accuracy can
+ * be sacrificed for memory footprint: the values in the sorted list are
+ * "promoted" to a "{@link HLLType#SPARSE}" map-based HyperLogLog structure.
+ * Finally, when enough registers are set, the map-based HLL will be converted
+ * to a bit-packed "{@link HLLType#FULL}" HyperLogLog structure.
+ *
+ * This data structure is interoperable with the implementations found at:
+ * <ul>
+ * <li><a href="https://github.com/aggregateknowledge/postgresql-hll">postgresql-hll</a>, and</li>
+ * <li><a href="https://github.com/aggregateknowledge/js-hll">js-hll</a></li>
+ * </ul>
+ * when <a href="https://github.com/aggregateknowledge/postgresql-hll/blob/master/STORAGE.markdown">properly serialized</a>.
+ */
+public class HLL implements Cloneable {
+ // minimum and maximum values for the log-base-2 of the number of registers
+ // in the HLL
+ public static final int MINIMUM_LOG2M_PARAM = 4;
+ public static final int MAXIMUM_LOG2M_PARAM = 30;
+
+ // minimum and maximum values for the register width of the HLL
+ public static final int MINIMUM_REGWIDTH_PARAM = 1;
+ public static final int MAXIMUM_REGWIDTH_PARAM = 8;
+
+ // minimum and maximum values for the 'expthresh' parameter of the
+ // constructor that is meant to match the PostgreSQL implementation's
+ // constructor and parameter names
+ public static final int MINIMUM_EXPTHRESH_PARAM = -1;
+ public static final int MAXIMUM_EXPTHRESH_PARAM = 18;
+ public static final int MAXIMUM_EXPLICIT_THRESHOLD = (1 << (MAXIMUM_EXPTHRESH_PARAM - 1)/*per storage spec*/);
+
+ // ************************************************************************
+ // Storage
+ // storage used when #type is EXPLICIT, null otherwise
+ LongOpenHashSet explicitStorage;
+ // storage used when #type is SPARSE, null otherwise
+ IntByteOpenHashMap sparseProbabilisticStorage;
+ // storage used when #type is FULL, null otherwise
+ BitVector probabilisticStorage;
+
+ // current type of this HLL instance, if this changes then so should the
+ // storage used (see above)
+ private HLLType type;
+
+ // ------------------------------------------------------------------------
+ // Characteristic parameters
+ // NOTE: These members are named to match the PostgreSQL implementation's
+ // parameters.
+ // log2(the number of probabilistic HLL registers)
+ private final int log2m;
+ // the size (width) each register in bits
+ private final int regwidth;
+
+ // ------------------------------------------------------------------------
+ // Computed constants
+ // ........................................................................
+ // EXPLICIT-specific constants
+ // flag indicating if the EXPLICIT representation should NOT be used
+ private final boolean explicitOff;
+ // flag indicating that the promotion threshold from EXPLICIT should be
+ // computed automatically
+ // NOTE: this only has meaning when 'explicitOff' is false
+ private final boolean explicitAuto;
+ // threshold (in element count) at which a EXPLICIT HLL is converted to a
+ // SPARSE or FULL HLL, always greater than or equal to zero and always a
+ // power of two OR simply zero
+ // NOTE: this only has meaning when 'explicitOff' is false
+ private final int explicitThreshold;
+
+ // ........................................................................
+ // SPARSE-specific constants
+ // the computed width of the short words
+ private final int shortWordLength;
+ // flag indicating if the SPARSE representation should not be used
+ private final boolean sparseOff;
+ // threshold (in register count) at which a SPARSE HLL is converted to a
+ // FULL HLL, always greater than zero
+ private final int sparseThreshold;
+
+ // ........................................................................
+ // Probabilistic algorithm constants
+ // the number of registers, will always be a power of 2
+ private final int m;
+ // a mask of the log2m bits set to one and the rest to zero
+ private final int mBitsMask;
+ // a mask as wide as a register (see #fromBytes())
+ private final int valueMask;
+ // mask used to ensure that p(w) does not overflow register (see #Constructor() and #addRaw())
+ private final long pwMaxMask;
+ // alpha * m^2 (the constant in the "'raw' HyperLogLog estimator")
+ private final double alphaMSquared;
+ // the cutoff value of the estimator for using the "small" range cardinality
+ // correction formula
+ private final double smallEstimatorCutoff;
+ // the cutoff value of the estimator for using the "large" range cardinality
+ // correction formula
+ private final double largeEstimatorCutoff;
+
+ // ========================================================================
+ /**
+ * NOTE: Arguments here are named and structured identically to those in the
+ * PostgreSQL implementation, which can be found
+ * <a href="https://github.com/aggregateknowledge/postgresql-hll/blob/master/README.markdown#explanation-of-parameters-and-tuning">here</a>.
+ *
+ * @param log2m log-base-2 of the number of registers used in the HyperLogLog
+ * algorithm. Must be at least 4 and at most 30.
+ * @param regwidth number of bits used per register in the HyperLogLog
+ * algorithm. Must be at least 1 and at most 8.
+ * @param expthresh tunes when the {@link HLLType#EXPLICIT} to
+ * {@link HLLType#SPARSE} promotion occurs,
+ * based on the set's cardinality. Must be at least -1 and at most 18.
+ * @param sparseon Flag indicating if the {@link HLLType#SPARSE}
+ * representation should be used.
+ * @param type the type in the promotion hierarchy which this instance should
+ * start at. This cannot be <code>null</code>.
+ */
+ public HLL(final int log2m, final int regwidth, final int expthresh, final boolean sparseon, final HLLType type) {
+ this.log2m = log2m;
+ if((log2m < MINIMUM_LOG2M_PARAM) || (log2m > MAXIMUM_LOG2M_PARAM)) {
+ throw new IllegalArgumentException("'log2m' must be at least " + MINIMUM_LOG2M_PARAM + " and at most " + MAXIMUM_LOG2M_PARAM + " (was: " + log2m + ")");
+ }
+
+ this.regwidth = regwidth;
+ if((regwidth < MINIMUM_REGWIDTH_PARAM) || (regwidth > MAXIMUM_REGWIDTH_PARAM)) {
+ throw new IllegalArgumentException("'regwidth' must be at least " + MINIMUM_REGWIDTH_PARAM + " and at most " + MAXIMUM_REGWIDTH_PARAM + " (was: " + regwidth + ")");
+ }
+
+ this.m = (1 << log2m);
+ this.mBitsMask = m - 1;
+ this.valueMask = (1 << regwidth) - 1;
+ this.pwMaxMask = HLLUtil.pwMaxMask(regwidth);
+ this.alphaMSquared = HLLUtil.alphaMSquared(m);
+ this.smallEstimatorCutoff = HLLUtil.smallEstimatorCutoff(m);
+ this.largeEstimatorCutoff = HLLUtil.largeEstimatorCutoff(log2m, regwidth);
+
+ if(expthresh == -1) {
+ this.explicitAuto = true;
+ this.explicitOff = false;
+
+ // NOTE: This math matches the size calculation in the PostgreSQL impl.
+ final long fullRepresentationSize = (this.regwidth * (long)this.m + 7/*round up to next whole byte*/)/Byte.SIZE;
+ final int numLongs = (int)(fullRepresentationSize / 8/*integer division to round down*/);
+
+ if(numLongs > MAXIMUM_EXPLICIT_THRESHOLD) {
+ this.explicitThreshold = MAXIMUM_EXPLICIT_THRESHOLD;
+ } else {
+ this.explicitThreshold = numLongs;
+ }
+ } else if(expthresh == 0) {
+ this.explicitAuto = false;
+ this.explicitOff = true;
+ this.explicitThreshold = 0;
+ } else if((expthresh > 0) && (expthresh <= MAXIMUM_EXPTHRESH_PARAM)){
+ this.explicitAuto = false;
+ this.explicitOff = false;
+ this.explicitThreshold = (1 << (expthresh - 1));
+ } else {
+ throw new IllegalArgumentException("'expthresh' must be at least " + MINIMUM_EXPTHRESH_PARAM + " and at most " + MAXIMUM_EXPTHRESH_PARAM + " (was: " + expthresh + ")");
+ }
+
+ this.shortWordLength = (regwidth + log2m);
+ this.sparseOff = !sparseon;
+ if(this.sparseOff) {
+ this.sparseThreshold = 0;
+ } else {
+ // TODO improve this cutoff to include the cost overhead of Java
+ // members/objects
+ final int largestPow2LessThanCutoff =
+ (int)NumberUtil.log2((this.m * this.regwidth) / this.shortWordLength);
+ this.sparseThreshold = (1 << largestPow2LessThanCutoff);
+ }
+
+ initializeStorage(type);
+ }
+
+ /**
+ * Construct an empty HLL with the given {@code log2m} and {@code regwidth}.
+ *
+ * This is equivalent to calling <code>HLL(log2m, regwidth, -1, true, HLLType.EMPTY)</code>.
+ *
+ * @param log2m log-base-2 of the number of registers used in the HyperLogLog
+ * algorithm. Must be at least 4 and at most 30.
+ * @param regwidth number of bits used per register in the HyperLogLog
+ * algorithm. Must be at least 1 and at most 8.
+ *
+ * @see #HLL(int, int, int, boolean, HLLType)
+ */
+ public HLL(final int log2m, final int regwidth) {
+ this(log2m, regwidth, -1, true, HLLType.EMPTY);
+ }
+
+ // -------------------------------------------------------------------------
+ /**
+ * Convenience constructor for testing. Assumes that both {@link HLLType#EXPLICIT}
+ * and {@link HLLType#SPARSE} representations should be enabled.
+ *
+ * @param log2m log-base-2 of the number of registers used in the HyperLogLog
+ * algorithm. Must be at least 4 and at most 30.
+ * @param regwidth number of bits used per register in the HyperLogLog
+ * algorithm. Must be at least 1 and at most 8.
+ * @param explicitThreshold cardinality threshold at which the {@link HLLType#EXPLICIT}
+ * representation should be promoted to {@link HLLType#SPARSE}.
+ * This must be greater than zero and less than or equal to {@value #MAXIMUM_EXPLICIT_THRESHOLD}.
+ * @param sparseThreshold register count threshold at which the {@link HLLType#SPARSE}
+ * representation should be promoted to {@link HLLType#FULL}.
+ * This must be greater than zero.
+ * @param type the type in the promotion hierarchy which this instance should
+ * start at. This cannot be <code>null</code>.
+ */
+ /*package, for testing*/ HLL(final int log2m, final int regwidth, final int explicitThreshold, final int sparseThreshold, final HLLType type) {
+ this.log2m = log2m;
+ if((log2m < MINIMUM_LOG2M_PARAM) || (log2m > MAXIMUM_LOG2M_PARAM)) {
+ throw new IllegalArgumentException("'log2m' must be at least " + MINIMUM_LOG2M_PARAM + " and at most " + MAXIMUM_LOG2M_PARAM + " (was: " + log2m + ")");
+ }
+
+ this.regwidth = regwidth;
+ if((regwidth < MINIMUM_REGWIDTH_PARAM) || (regwidth > MAXIMUM_REGWIDTH_PARAM)) {
+ throw new IllegalArgumentException("'regwidth' must be at least " + MINIMUM_REGWIDTH_PARAM + " and at most " + MAXIMUM_REGWIDTH_PARAM + " (was: " + regwidth + ")");
+ }
+
+ this.m = (1 << log2m);
+ this.mBitsMask = m - 1;
+ this.valueMask = (1 << regwidth) - 1;
+ this.pwMaxMask = HLLUtil.pwMaxMask(regwidth);
+ this.alphaMSquared = HLLUtil.alphaMSquared(m);
+ this.smallEstimatorCutoff = HLLUtil.smallEstimatorCutoff(m);
+ this.largeEstimatorCutoff = HLLUtil.largeEstimatorCutoff(log2m, regwidth);
+
+ this.explicitAuto = false;
+ this.explicitOff = false;
+ this.explicitThreshold = explicitThreshold;
+ if((explicitThreshold < 1) || (explicitThreshold > MAXIMUM_EXPLICIT_THRESHOLD)) {
+ throw new IllegalArgumentException("'explicitThreshold' must be at least 1 and at most " + MAXIMUM_EXPLICIT_THRESHOLD + " (was: " + explicitThreshold + ")");
+ }
+
+ this.shortWordLength = (regwidth + log2m);
+ this.sparseOff = false;
+ this.sparseThreshold = sparseThreshold;
+
+ initializeStorage(type);
+ }
+
+ /**
+ * @return the type in the promotion hierarchy of this instance. This will
+ * never be <code>null</code>.
+ */
+ public HLLType getType() { return type; }
+
+ // ========================================================================
+ // Add
+ /**
+ * Adds <code>rawValue</code> directly to the HLL.
+ *
+ * @param rawValue the value to be added. It is very important that this
+ * value <em>already be hashed</em> with a strong (but not
+ * necessarily cryptographic) hash function. For instance, the
+ * Murmur3 implementation in
+ * <a href="http://guava-libraries.googlecode.com/git/guava/src/com/google/common/hash/Murmur3_128HashFunction.java">
+ * Google's Guava</a> library is an excellent hash function for this
+ * purpose and, for seeds greater than zero, matches the output
+ * of the hash provided in the PostgreSQL implementation.
+ */
+ public void addRaw(final long rawValue) {
+ switch(type) {
+ case EMPTY: {
+ // NOTE: EMPTY type is always promoted on #addRaw()
+ if(explicitThreshold > 0) {
+ initializeStorage(HLLType.EXPLICIT);
+ explicitStorage.add(rawValue);
+ } else if(!sparseOff) {
+ initializeStorage(HLLType.SPARSE);
+ addRawSparseProbabilistic(rawValue);
+ } else {
+ initializeStorage(HLLType.FULL);
+ addRawProbabilistic(rawValue);
+ }
+ return;
+ }
+ case EXPLICIT: {
+ explicitStorage.add(rawValue);
+
+ // promotion, if necessary
+ if(explicitStorage.size() > explicitThreshold) {
+ if(!sparseOff) {
+ initializeStorage(HLLType.SPARSE);
+ for (LongCursor c : explicitStorage) {
+ addRawSparseProbabilistic(c.value);
+ }
+ } else {
+ initializeStorage(HLLType.FULL);
+ for (LongCursor c : explicitStorage) {
+ addRawProbabilistic(c.value);
+ }
+ }
+ explicitStorage = null;
+ }
+ return;
+ }
+ case SPARSE: {
+ addRawSparseProbabilistic(rawValue);
+
+ // promotion, if necessary
+ if(sparseProbabilisticStorage.size() > sparseThreshold) {
+ initializeStorage(HLLType.FULL);
+ for(IntByteCursor c : sparseProbabilisticStorage) {
+ final int registerIndex = c.key;
+ final byte registerValue = c.value;
+ probabilisticStorage.setMaxRegister(registerIndex, registerValue);
+ }
+ sparseProbabilisticStorage = null;
+ }
+ return;
+ }
+ case FULL:
+ addRawProbabilistic(rawValue);
+ return;
+ default:
+ throw new RuntimeException("Unsupported HLL type " + type);
+ }
+ }
+
+ // ------------------------------------------------------------------------
+ // #addRaw(..) helpers
+ /**
+ * Adds the raw value to the {@link #sparseProbabilisticStorage}.
+ * {@link #type} must be {@link HLLType#SPARSE}.
+ *
+ * @param rawValue the raw value to add to the sparse storage.
+ */
+ private void addRawSparseProbabilistic(final long rawValue) {
+ // p(w): position of the least significant set bit (one-indexed)
+ // By contract: p(w) <= 2^(registerValueInBits) - 1 (the max register value)
+ //
+ // By construction of pwMaxMask (see #Constructor()),
+ // lsb(pwMaxMask) = 2^(registerValueInBits) - 2,
+ // thus lsb(any_long | pwMaxMask) <= 2^(registerValueInBits) - 2,
+ // thus 1 + lsb(any_long | pwMaxMask) <= 2^(registerValueInBits) -1.
+ final long substreamValue = (rawValue >>> log2m);
+ final byte p_w;
+
+ if(substreamValue == 0L) {
+ // The paper does not cover p(0x0), so the special value 0 is used.
+ // 0 is the original initialization value of the registers, so by
+ // doing this the multiset simply ignores it. This is acceptable
+ // because the probability is 1/(2^(2^registerSizeInBits)).
+ p_w = 0;
+ } else {
+ p_w = (byte)(1 + BitUtil.leastSignificantBit(substreamValue| pwMaxMask));
+ }
+
+ // Short-circuit if the register is being set to zero, since algorithmically
+ // this corresponds to an "unset" register, and "unset" registers aren't
+ // stored to save memory. (The very reason this sparse implementation
+ // exists.) If a register is set to zero it will break the #algorithmCardinality
+ // code.
+ if(p_w == 0) {
+ return;
+ }
+
+ // NOTE: no +1 as in paper since 0-based indexing
+ final int j = (int)(rawValue & mBitsMask);
+
+ final byte currentValue;
+ if (sparseProbabilisticStorage.containsKey(j)) {
+ currentValue = sparseProbabilisticStorage.lget();
+ } else {
+ currentValue = 0;
+ }
+
+ if(p_w > currentValue) {
+ sparseProbabilisticStorage.put(j, p_w);
+ }
+ }
+
+ /**
+ * Adds the raw value to the {@link #probabilisticStorage}.
+ * {@link #type} must be {@link HLLType#FULL}.
+ *
+ * @param rawValue the raw value to add to the full probabilistic storage.
+ */
+ private void addRawProbabilistic(final long rawValue) {
+ // p(w): position of the least significant set bit (one-indexed)
+ // By contract: p(w) <= 2^(registerValueInBits) - 1 (the max register value)
+ //
+ // By construction of pwMaxMask (see #Constructor()),
+ // lsb(pwMaxMask) = 2^(registerValueInBits) - 2,
+ // thus lsb(any_long | pwMaxMask) <= 2^(registerValueInBits) - 2,
+ // thus 1 + lsb(any_long | pwMaxMask) <= 2^(registerValueInBits) -1.
+ final long substreamValue = (rawValue >>> log2m);
+ final byte p_w;
+
+ if (substreamValue == 0L) {
+ // The paper does not cover p(0x0), so the special value 0 is used.
+ // 0 is the original initialization value of the registers, so by
+ // doing this the multiset simply ignores it. This is acceptable
+ // because the probability is 1/(2^(2^registerSizeInBits)).
+ p_w = 0;
+ } else {
+ p_w = (byte)(1 + BitUtil.leastSignificantBit(substreamValue| pwMaxMask));
+ }
+
+ // Short-circuit if the register is being set to zero, since algorithmically
+ // this corresponds to an "unset" register, and "unset" registers aren't
+ // stored to save memory. (The very reason this sparse implementation
+ // exists.) If a register is set to zero it will break the #algorithmCardinality
+ // code.
+ if(p_w == 0) {
+ return;
+ }
+
+ // NOTE: no +1 as in paper since 0-based indexing
+ final int j = (int)(rawValue & mBitsMask);
+
+ probabilisticStorage.setMaxRegister(j, p_w);
+ }
+
+ // ------------------------------------------------------------------------
+ // Storage helper
+ /**
+ * Initializes storage for the specified {@link HLLType} and changes the
+ * instance's {@link #type}.
+ *
+ * @param type the {@link HLLType} to initialize storage for. This cannot be
+ * <code>null</code> and must be an instantiable type.
+ */
+ private void initializeStorage(final HLLType type) {
+ this.type = type;
+ switch(type) {
+ case EMPTY:
+ // nothing to be done
+ break;
+ case EXPLICIT:
+ this.explicitStorage = new LongOpenHashSet();
+ break;
+ case SPARSE:
+ this.sparseProbabilisticStorage = new IntByteOpenHashMap();
+ break;
+ case FULL:
+ this.probabilisticStorage = new BitVector(regwidth, m);
+ break;
+ default:
+ throw new RuntimeException("Unsupported HLL type " + type);
+ }
+ }
+
+ // ========================================================================
+ // Cardinality
+ /**
+ * Computes the cardinality of the HLL.
+ *
+ * @return the cardinality of HLL. This will never be negative.
+ */
+ public long cardinality() {
+ switch(type) {
+ case EMPTY:
+ return 0/*by definition*/;
+ case EXPLICIT:
+ return explicitStorage.size();
+ case SPARSE:
+ return (long)Math.ceil(sparseProbabilisticAlgorithmCardinality());
+ case FULL:
+ return (long)Math.ceil(fullProbabilisticAlgorithmCardinality());
+ default:
+ throw new RuntimeException("Unsupported HLL type " + type);
+ }
+ }
+
+ // ------------------------------------------------------------------------
+ // Cardinality helpers
+ /**
+ * Computes the exact cardinality value returned by the HLL algorithm when
+ * represented as a {@link HLLType#SPARSE} HLL. Kept
+ * separate from {@link #cardinality()} for testing purposes. {@link #type}
+ * must be {@link HLLType#SPARSE}.
+ *
+ * @return the exact, unrounded cardinality given by the HLL algorithm
+ */
+ /*package, for testing*/ double sparseProbabilisticAlgorithmCardinality() {
+ final int m = this.m/*for performance*/;
+
+ // compute the "indicator function" -- sum(2^(-M[j])) where M[j] is the
+ // 'j'th register value
+ double sum = 0;
+ int numberOfZeroes = 0/*"V" in the paper*/;
+ for(int j=0; j<m; j++) {
+ final long register;
+ if (sparseProbabilisticStorage.containsKey(j)) {
+ register = sparseProbabilisticStorage.lget();
+ } else {
+ register = 0;
+ }
+
+ sum += 1.0 / (1L << register);
+ if(register == 0L) numberOfZeroes++;
+ }
+
+ // apply the estimate and correction to the indicator function
+ final double estimator = alphaMSquared / sum;
+ if((numberOfZeroes != 0) && (estimator < smallEstimatorCutoff)) {
+ return HLLUtil.smallEstimator(m, numberOfZeroes);
+ } else if(estimator <= largeEstimatorCutoff) {
+ return estimator;
+ } else {
+ return HLLUtil.largeEstimator(log2m, regwidth, estimator);
+ }
+ }
+
+ /**
+ * Computes the exact cardinality value returned by the HLL algorithm when
+ * represented as a {@link HLLType#FULL} HLL. Kept
+ * separate from {@link #cardinality()} for testing purposes. {@link #type}
+ * must be {@link HLLType#FULL}.
+ *
+ * @return the exact, unrounded cardinality given by the HLL algorithm
+ */
+ /*package, for testing*/ double fullProbabilisticAlgorithmCardinality() {
+ final int m = this.m/*for performance*/;
+
+ // compute the "indicator function" -- sum(2^(-M[j])) where M[j] is the
+ // 'j'th register value
+ double sum = 0;
+ int numberOfZeroes = 0/*"V" in the paper*/;
+ final LongIterator iterator = probabilisticStorage.registerIterator();
+ while(iterator.hasNext()) {
+ final long register = iterator.next();
+
+ sum += 1.0 / (1L << register);
+ if(register == 0L) numberOfZeroes++;
+ }
+
+ // apply the estimate and correction to the indicator function
+ final double estimator = alphaMSquared / sum;
+ if((numberOfZeroes != 0) && (estimator < smallEstimatorCutoff)) {
+ return HLLUtil.smallEstimator(m, numberOfZeroes);
+ } else if(estimator <= largeEstimatorCutoff) {
+ return estimator;
+ } else {
+ return HLLUtil.largeEstimator(log2m, regwidth, estimator);
+ }
+ }
+
+ // ========================================================================
+ // Clear
+ /**
+ * Clears the HLL. The HLL will have cardinality zero and will act as if no
+ * elements have been added.
+ *
+ * NOTE: Unlike {@link #addRaw(long)}, <code>clear</code> does NOT handle
+ * transitions between {@link HLLType}s - a probabilistic type will remain
+ * probabilistic after being cleared.
+ */
+ public void clear() {
+ switch(type) {
+ case EMPTY:
+ return /*do nothing*/;
+ case EXPLICIT:
+ explicitStorage.clear();
+ return;
+ case SPARSE:
+ sparseProbabilisticStorage.clear();
+ return;
+ case FULL:
+ probabilisticStorage.fill(0);
+ return;
+ default:
+ throw new RuntimeException("Unsupported HLL type " + type);
+ }
+ }
+
+ // ========================================================================
+ // Union
+ /**
+ * Computes the union of HLLs and stores the result in this instance.
+ *
+ * @param other the other {@link HLL} instance to union into this one. This
+ * cannot be <code>null</code>.
+ */
+ public void union(final HLL other) {
+ // TODO: verify HLLs are compatible
+ final HLLType otherType = other.getType();
+
+ if(type.equals(otherType)) {
+ homogeneousUnion(other);
+ return;
+ } else {
+ heterogenousUnion(other);
+ return;
+ }
+ }
+
+ // ------------------------------------------------------------------------
+ // Union helpers
+ /**
+ * Computes the union of two HLLs, of different types, and stores the
+ * result in this instance.
+ *
+ * @param other the other {@link HLL} instance to union into this one. This
+ * cannot be <code>null</code>.
+ */
+ /*package, for testing*/ void heterogenousUnion(final HLL other) {
+ /*
+ * The logic here is divided into two sections: unions with an EMPTY
+ * HLL, and unions between EXPLICIT/SPARSE/FULL
+ * HLL.
+ *
+ * Between those two sections, all possible heterogeneous unions are
+ * covered. Should another type be added to HLLType whose unions
+ * are not easily reduced (say, as EMPTY's are below) this may be more
+ * easily implemented as Strategies. However, that is unnecessary as it
+ * stands.
+ */
+
+ // ....................................................................
+ // Union with an EMPTY
+ if(HLLType.EMPTY.equals(type)) {
+ // NOTE: The union of empty with non-empty HLL is just a
+ // clone of the non-empty.
+
+ switch(other.getType()) {
+ case EXPLICIT: {
+ // src: EXPLICIT
+ // dest: EMPTY
+
+ if(other.explicitStorage.size() <= explicitThreshold) {
+ type = HLLType.EXPLICIT;
+ explicitStorage = other.explicitStorage.clone();
+ } else {
+ if(!sparseOff) {
+ initializeStorage(HLLType.SPARSE);
+ } else {
+ initializeStorage(HLLType.FULL);
+ }
+ for(LongCursor c : other.explicitStorage) {
+ addRaw(c.value);
+ }
+ }
+ return;
+ }
+ case SPARSE: {
+ // src: SPARSE
+ // dest: EMPTY
+
+ if(!sparseOff) {
+ type = HLLType.SPARSE;
+ sparseProbabilisticStorage = other.sparseProbabilisticStorage.clone();
+ } else {
+ initializeStorage(HLLType.FULL);
+ for(IntByteCursor c : other.sparseProbabilisticStorage) {
+ final int registerIndex = c.key;
+ final byte registerValue = c.value;
+ probabilisticStorage.setMaxRegister(registerIndex, registerValue);
+ }
+ }
+ return;
+ }
+ default/*case FULL*/: {
+ // src: FULL
+ // dest: EMPTY
+
+ type = HLLType.FULL;
+ probabilisticStorage = other.probabilisticStorage.clone();
+ return;
+ }
+ }
+ } else if (HLLType.EMPTY.equals(other.getType())) {
+ // source is empty, so just return destination since it is unchanged
+ return;
+ } /* else -- both of the sets are not empty */
+
+ // ....................................................................
+ // NOTE: Since EMPTY is handled above, the HLLs are non-EMPTY below
+ switch(type) {
+ case EXPLICIT: {
+ // src: FULL/SPARSE
+ // dest: EXPLICIT
+ // "Storing into destination" cannot be done (since destination
+ // is by definition of smaller capacity than source), so a clone
+ // of source is made and values from destination are inserted
+ // into that.
+
+ // Determine source and destination storage.
+ // NOTE: destination storage may change through promotion if
+ // source is SPARSE.
+ if(HLLType.SPARSE.equals(other.getType())) {
+ if(!sparseOff) {
+ type = HLLType.SPARSE;
+ sparseProbabilisticStorage = other.sparseProbabilisticStorage.clone();
+ } else {
+ initializeStorage(HLLType.FULL);
+ for(IntByteCursor c : other.sparseProbabilisticStorage) {
+ final int registerIndex = c.key;
+ final byte registerValue = c.value;
+ probabilisticStorage.setMaxRegister(registerIndex, registerValue);
+ }
+ }
+ } else /*source is HLLType.FULL*/ {
+ type = HLLType.FULL;
+ probabilisticStorage = other.probabilisticStorage.clone();
+ }
+ for(LongCursor c : explicitStorage) {
+ addRaw(c.value);
+ }
+ explicitStorage = null;
+ return;
+ }
+ case SPARSE: {
+ if(HLLType.EXPLICIT.equals(other.getType())) {
+ // src: EXPLICIT
+ // dest: SPARSE
+ // Add the raw values from the source to the destination.
+
+ for(LongCursor c : other.explicitStorage) {
+ addRaw(c.value);
+ }
+ // NOTE: addRaw will handle promotion cleanup
+ } else /*source is HLLType.FULL*/ {
+ // src: FULL
+ // dest: SPARSE
+ // "Storing into destination" cannot be done (since destination
+ // is by definition of smaller capacity than source), so a
+ // clone of source is made and registers from the destination
+ // are merged into the clone.
+
+ type = HLLType.FULL;
+ probabilisticStorage = other.probabilisticStorage.clone();
+ for(IntByteCursor c : sparseProbabilisticStorage) {
+ final int registerIndex = c.key;
+ final byte registerValue = c.value;
+ probabilisticStorage.setMaxRegister(registerIndex, registerValue);
+ }
+ sparseProbabilisticStorage = null;
+ }
+ return;
+ }
+ default/*destination is HLLType.FULL*/: {
+ if(HLLType.EXPLICIT.equals(other.getType())) {
+ // src: EXPLICIT
+ // dest: FULL
+ // Add the raw values from the source to the destination.
+ // Promotion is not possible, so don't bother checking.
+
+ for(LongCursor c : other.explicitStorage) {
+ addRaw(c.value);
+ }
+ } else /*source is HLLType.SPARSE*/ {
+ // src: SPARSE
+ // dest: FULL
+ // Merge the registers from the source into the destination.
+ // Promotion is not possible, so don't bother checking.
+
+ for(IntByteCursor c : other.sparseProbabilisticStorage) {
+ final int registerIndex = c.key;
+ final byte registerValue = c.value;
+ probabilisticStorage.setMaxRegister(registerIndex, registerValue);
+ }
+ }
+ }
+ }
+ }
+
+ /**
+ * Computes the union of two HLLs of the same type, and stores the
+ * result in this instance.
+ *
+ * @param other the other {@link HLL} instance to union into this one. This
+ * cannot be <code>null</code>.
+ */
+ private void homogeneousUnion(final HLL other) {
+ switch(type) {
+ case EMPTY:
+ // union of empty and empty is empty
+ return;
+ case EXPLICIT:
+ for(LongCursor c : other.explicitStorage) {
+ addRaw(c.value);
+ }
+ // NOTE: #addRaw() will handle promotion, if necessary
+ return;
+ case SPARSE:
+ for(IntByteCursor c : other.sparseProbabilisticStorage) {
+ final int registerIndex = c.key;
+ final byte registerValue = c.value;
+ final byte currentRegisterValue = sparseProbabilisticStorage.get(registerIndex);
+ if(registerValue > currentRegisterValue) {
+ sparseProbabilisticStorage.put(registerIndex, registerValue);
+ }
+ }
+
+ // promotion, if necessary
+ if(sparseProbabilisticStorage.size() > sparseThreshold) {
+ initializeStorage(HLLType.FULL);
+ for(IntByteCursor c : sparseProbabilisticStorage) {
+ final int registerIndex = c.key;
+ final byte registerValue = c.value;
+ probabilisticStorage.setMaxRegister(registerIndex, registerValue);
+ }
+ sparseProbabilisticStorage = null;
+ }
+ return;
+ case FULL:
+ for(int i=0; i<m; i++) {
+ final long registerValue = other.probabilisticStorage.getRegister(i);
+ probabilisticStorage.setMaxRegister(i, registerValue);
+ }
+ return;
+ default:
+ throw new RuntimeException("Unsupported HLL type " + type);
+ }
+ }
+
+ // ========================================================================
+ // Serialization
+ /**
+ * Serializes the HLL to an array of bytes in correspondence with the format
+ * of the default schema version, {@link SerializationUtil#DEFAULT_SCHEMA_VERSION}.
+ *
+ * @return the array of bytes representing the HLL. This will never be
+ * <code>null</code> or empty.
+ */
+ public byte[] toBytes() {
+ return toBytes(SerializationUtil.DEFAULT_SCHEMA_VERSION);
+ }
+
+ /**
+ * Serializes the HLL to an array of bytes in correspondence with the format
+ * of the specified schema version.
+ *
+ * @param schemaVersion the schema version dictating the serialization format
+ * @return the array of bytes representing the HLL. This will never be
+ * <code>null</code> or empty.
+ */
+ public byte[] toBytes(final ISchemaVersion schemaVersion) {
+ final byte[] bytes;
+ switch(type) {
+ case EMPTY:
+ bytes = new byte[schemaVersion.paddingBytes(type)];
+ break;
+ case EXPLICIT: {
+ final IWordSerializer serializer =
+ schemaVersion.getSerializer(type, Long.SIZE, explicitStorage.size());
+
+ final long[] values = explicitStorage.toArray();
+ Arrays.sort(values);
+ for(final long value : values) {
+ serializer.writeWord(value);
+ }
+
+ bytes = serializer.getBytes();
+ break;
+ }
+ case SPARSE: {
+ final IWordSerializer serializer =
+ schemaVersion.getSerializer(type, shortWordLength, sparseProbabilisticStorage.size());
+
+ final int[] indices = sparseProbabilisticStorage.keys().toArray();
+ Arrays.sort(indices);
+ for(final int registerIndex : indices) {
+ assert sparseProbabilisticStorage.containsKey(registerIndex);
+ final long registerValue = sparseProbabilisticStorage.get(registerIndex);
+ // pack index and value into "short word"
+ final long shortWord = ((registerIndex << regwidth) | registerValue);
+ serializer.writeWord(shortWord);
+ }
+
+ bytes = serializer.getBytes();
+ break;
+ }
+ case FULL: {
+ final IWordSerializer serializer = schemaVersion.getSerializer(type, regwidth, m);
+ probabilisticStorage.getRegisterContents(serializer);
+
+ bytes = serializer.getBytes();
+ break;
+ }
+ default:
+ throw new RuntimeException("Unsupported HLL type " + type);
+ }
+
+ final IHLLMetadata metadata = new HLLMetadata(schemaVersion.schemaVersionNumber(),
+ type,
+ log2m,
+ regwidth,
+ (int)NumberUtil.log2(explicitThreshold),
+ explicitOff,
+ explicitAuto,
+ !sparseOff);
+ schemaVersion.writeMetadata(bytes, metadata);
+
+ return bytes;
+ }
+
+ /**
+ * Deserializes the HLL (in {@link #toBytes(ISchemaVersion)} format) serialized
+ * into <code>bytes</code>.
+ *
+ * @param bytes the serialized bytes of new HLL
+ * @return the deserialized HLL. This will never be <code>null</code>.
+ *
+ * @see #toBytes(ISchemaVersion)
+ */
+ public static HLL fromBytes(final byte[] bytes) {
+ final ISchemaVersion schemaVersion = SerializationUtil.getSchemaVersion(bytes);
+ final IHLLMetadata metadata = schemaVersion.readMetadata(bytes);
+
+ final HLLType type = metadata.HLLType();
+ final int regwidth = metadata.registerWidth();
+ final int log2m = metadata.registerCountLog2();
+ final boolean sparseon = metadata.sparseEnabled();
+
+ final int expthresh;
+ if(metadata.explicitAuto()) {
+ expthresh = -1;
+ } else if(metadata.explicitOff()) {
+ expthresh = 0;
+ } else {
+ // NOTE: take into account that the postgres-compatible constructor
+ // subtracts one before taking a power of two.
+ expthresh = metadata.log2ExplicitCutoff() + 1;
+ }
+
+ final HLL hll = new HLL(log2m, regwidth, expthresh, sparseon, type);
+
+ // Short-circuit on empty, which needs no other deserialization.
+ if(HLLType.EMPTY.equals(type)) {
+ return hll;
+ }
+
+ final int wordLength;
+ switch(type) {
+ case EXPLICIT:
+ wordLength = Long.SIZE;
+ break;
+ case SPARSE:
+ wordLength = hll.shortWordLength;
+ break;
+ case FULL:
+ wordLength = hll.regwidth;
+ break;
+ default:
+ throw new RuntimeException("Unsupported HLL type " + type);
+ }
+
+ final IWordDeserializer deserializer =
+ schemaVersion.getDeserializer(type, wordLength, bytes);
+ switch(type) {
+ case EXPLICIT:
+ // NOTE: This should not exceed expthresh and this will always
+ // be exactly the number of words that were encoded,
+ // because the word length is at least a byte wide.
+ // SEE: IWordDeserializer#totalWordCount()
+ for(int i=0; i<deserializer.totalWordCount(); i++) {
+ hll.explicitStorage.add(deserializer.readWord());
+ }
+ break;
+ case SPARSE:
+ // NOTE: If the shortWordLength were smaller than 8 bits
+ // (1 byte) there would be a possibility (because of
+ // padding arithmetic) of having one or more extra
+ // registers read. However, this is not relevant as the
+ // extra registers will be all zeroes, which are ignored
+ // in the sparse representation.
+ for(int i=0; i<deserializer.totalWordCount(); i++) {
+ final long shortWord = deserializer.readWord();
+ final byte registerValue = (byte)(shortWord & hll.valueMask);
+ // Only set non-zero registers.
+ if (registerValue != 0) {
+ hll.sparseProbabilisticStorage.put((int)(shortWord >>> hll.regwidth), registerValue);
+ }
+ }
+ break;
+ case FULL:
+ // NOTE: Iteration is done using m (register count) and NOT
+ // deserializer#totalWordCount() because regwidth may be
+ // less than 8 and as such the padding on the 'last' byte
+ // may be larger than regwidth, causing an extra register
+ // to be read.
+ // SEE: IWordDeserializer#totalWordCount()
+ for(long i=0; i<hll.m; i++) {
+ hll.probabilisticStorage.setRegister(i, deserializer.readWord());
+ }
+ break;
+ default:
+ throw new RuntimeException("Unsupported HLL type " + type);
+ }
+
+ return hll;
+ }
+
+ /**
+ * Create a deep copy of this HLL.
+ *
+ * @see java.lang.Object#clone()
+ */
+ @Override
+ public HLL clone() throws CloneNotSupportedException {
+ // NOTE: Since the package-only constructor assumes both explicit and
+ // sparse are enabled, the easiest thing to do here is to re-derive
+ // the expthresh parameter and create a new HLL with the public
+ // constructor.
+ // TODO: add a more sensible constructor to make this less obfuscated
+ final int copyExpthresh;
+ if(explicitAuto) {
+ copyExpthresh = -1;
+ } else if(explicitOff) {
+ copyExpthresh = 0;
+ } else {
+ // explicitThreshold is defined as:
+ //
+ // this.explicitThreshold = (1 << (expthresh - 1));
+ //
+ // Since explicitThreshold is a power of two and only has a single
+ // bit set, finding the LSB is the same as finding the inverse
+ copyExpthresh = BitUtil.leastSignificantBit(explicitThreshold) + 1;
+ }
+ final HLL copy = new HLL(log2m, regwidth, copyExpthresh, !sparseOff/*sparseOn*/, type);
+ switch(type) {
+ case EMPTY:
+ // nothing to be done
+ break;
+ case EXPLICIT:
+ copy.explicitStorage = this.explicitStorage.clone();
+ break;
+ case SPARSE:
+ copy.sparseProbabilisticStorage = this.sparseProbabilisticStorage.clone();
+ break;
+ case FULL:
+ copy.probabilisticStorage = this.probabilisticStorage.clone();
+ break;
+ default:
+ throw new RuntimeException("Unsupported HLL type " + type);
+ }
+ return copy;
+ }
+}
diff --git a/solr/core/src/java/org/apache/solr/util/hll/HLLMetadata.java b/solr/core/src/java/org/apache/solr/util/hll/HLLMetadata.java
new file mode 100644
index 0000000..1e4e4ad
--- /dev/null
+++ b/solr/core/src/java/org/apache/solr/util/hll/HLLMetadata.java
@@ -0,0 +1,136 @@
+package org.apache.solr.util.hll;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * A concrete {@link IHLLMetadata} implemented as a simple struct.
+ */
+class HLLMetadata implements IHLLMetadata {
+ private final int schemaVersion;
+ private final HLLType type;
+ private final int registerCountLog2;
+ private final int registerWidth;
+ private final int log2ExplicitCutoff;
+ private final boolean explicitOff;
+ private final boolean explicitAuto;
+ private final boolean sparseEnabled;
+
+ /**
+ * @param schemaVersion the schema version number of the HLL. This must
+ * be greater than or equal to zero.
+ * @param type the {@link HLLType type} of the HLL. This cannot
+ * be <code>null</code>.
+ * @param registerCountLog2 the log-base-2 register count parameter for
+ * probabilistic HLLs. This must be greater than or equal to zero.
+ * @param registerWidth the register width parameter for probabilistic
+ * HLLs. This must be greater than or equal to zero.
+ * @param log2ExplicitCutoff the log-base-2 of the explicit cardinality cutoff,
+ * if it is explicitly defined. (If <code>explicitOff</code> or
+ * <code>explicitAuto</code> is <code>true</code> then this has no
+ * meaning.)
+ * @param explicitOff the flag for 'explicit off'-mode, where the
+ * {@link HLLType#EXPLICIT} representation is not used. Both this and
+ * <code>explicitAuto</code> cannot be <code>true</code> at the same
+ * time.
+ * @param explicitAuto the flag for 'explicit auto'-mode, where the
+ * {@link HLLType#EXPLICIT} representation's promotion cutoff is
+ * determined based on in-memory size automatically. Both this and
+ * <code>explicitOff</code> cannot be <code>true</code> at the same
+ * time.
+ * @param sparseEnabled the flag for 'sparse-enabled'-mode, where the
+ * {@link HLLType#SPARSE} representation is used.
+ */
+ public HLLMetadata(final int schemaVersion,
+ final HLLType type,
+ final int registerCountLog2,
+ final int registerWidth,
+ final int log2ExplicitCutoff,
+ final boolean explicitOff,
+ final boolean explicitAuto,
+ final boolean sparseEnabled) {
+ this.schemaVersion = schemaVersion;
+ this.type = type;
+ this.registerCountLog2 = registerCountLog2;
+ this.registerWidth = registerWidth;
+ this.log2ExplicitCutoff = log2ExplicitCutoff;
+ this.explicitOff = explicitOff;
+ this.explicitAuto = explicitAuto;
+ this.sparseEnabled = sparseEnabled;
+ }
+
+ /* (non-Javadoc)
+ * @see net.agkn.hll.serialization.IHLLMetadata#schemaVersion()
+ */
+ @Override
+ public int schemaVersion() { return schemaVersion; }
+
+ /* (non-Javadoc)
+ * @see net.agkn.hll.serialization.IHLLMetadata#HLLType()
+ */
+ @Override
+ public HLLType HLLType() { return type; }
+
+ /* (non-Javadoc)
+ * @see net.agkn.hll.serialization.IHLLMetadata#registerCountLog2()
+ */
+ @Override
+ public int registerCountLog2() { return registerCountLog2; }
+
+ /* (non-Javadoc)
+ * @see net.agkn.hll.serialization.IHLLMetadata#registerWidth()
+ */
+ @Override
+ public int registerWidth() { return registerWidth; }
+
+ /* (non-Javadoc)
+ * @see net.agkn.hll.serialization.IHLLMetadata#log2ExplicitCutoff()
+ */
+ @Override
+ public int log2ExplicitCutoff() { return log2ExplicitCutoff; }
+
+ /* (non-Javadoc)
+ * @see net.agkn.hll.serialization.IHLLMetadata#explicitOff()
+ */
+ @Override
+ public boolean explicitOff() {
+ return explicitOff;
+ }
+
+ /* (non-Javadoc)
+ * @see net.agkn.hll.serialization.IHLLMetadata#explicitAuto()
+ * @see net.agkn.hll.serialization.IHLLMetadata#log2ExplicitCutoff()
+ */
+ @Override
+ public boolean explicitAuto() {
+ return explicitAuto;
+ }
+
+ /* (non-Javadoc)
+ * @see net.agkn.hll.serialization.IHLLMetadata#sparseEnabled()
+ */
+ @Override
+ public boolean sparseEnabled() { return sparseEnabled; }
+
+ /* (non-Javadoc)
+ * @see java.lang.Object#toString()
+ */
+ @Override
+ public String toString() {
+ return "<HLLMetadata schemaVersion: " + this.schemaVersion + ", type: " + this.type.toString() + ", registerCountLog2: " + this.registerCountLog2 + ", registerWidth: " + this.registerWidth + ", log2ExplicitCutoff: " + this.log2ExplicitCutoff + ", explicitOff: " + this.explicitOff + ", explicitAuto: " +this.explicitAuto + ">";
+ }
+}
diff --git a/solr/core/src/java/org/apache/solr/util/hll/HLLType.java b/solr/core/src/java/org/apache/solr/util/hll/HLLType.java
new file mode 100644
index 0000000..e08fb14
--- /dev/null
+++ b/solr/core/src/java/org/apache/solr/util/hll/HLLType.java
@@ -0,0 +1,29 @@
+package org.apache.solr.util.hll;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * The types of algorithm/data structure that {@link HLL} can utilize. For more
+ * information, see the Javadoc for {@link HLL}.
+ */
+public enum HLLType {
+ EMPTY,
+ EXPLICIT,
+ SPARSE,
+ FULL;
+}
\ No newline at end of file
diff --git a/solr/core/src/java/org/apache/solr/util/hll/HLLUtil.java b/solr/core/src/java/org/apache/solr/util/hll/HLLUtil.java
new file mode 100644
index 0000000..35aa86e
--- /dev/null
+++ b/solr/core/src/java/org/apache/solr/util/hll/HLLUtil.java
@@ -0,0 +1,199 @@
+package org.apache.solr.util.hll;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Static functions for computing constants and parameters used in the HLL
+ * algorithm.
+ */
+final class HLLUtil {
+ /**
+ * Precomputed <code>pwMaxMask</code> values indexed by <code>registerSizeInBits</code>.
+ * Calculated with this formula:
+ * <pre>
+ * int maxRegisterValue = (1 << registerSizeInBits) - 1;
+ * // Mask with all bits set except for (maxRegisterValue - 1) least significant bits (see #addRaw())
+ * return ~((1L << (maxRegisterValue - 1)) - 1);
+ * </pre>
+ *
+ * @see #pwMaxMask(int)
+ */
+ private static final long[] PW_MASK = {
+ ~((1L << (((1 << 0) - 1) - 1)) - 1),
+ ~((1L << (((1 << 1) - 1) - 1)) - 1),
+ ~((1L << (((1 << 2) - 1) - 1)) - 1),
+ ~((1L << (((1 << 3) - 1) - 1)) - 1),
+ ~((1L << (((1 << 4) - 1) - 1)) - 1),
+ ~((1L << (((1 << 5) - 1) - 1)) - 1),
+ ~((1L << (((1 << 6) - 1) - 1)) - 1),
+ ~((1L << (((1 << 7) - 1) - 1)) - 1),
+ ~((1L << (((1 << 8) - 1) - 1)) - 1)
+ };
+
+ /**
+ * Precomputed <code>twoToL</code> values indexed by a linear combination of
+ * <code>regWidth</code> and <code>log2m</code>.
+ *
+ * The array is one-dimensional and can be accessed by using index
+ * <code>(REG_WIDTH_INDEX_MULTIPLIER * regWidth) + log2m</code>
+ * for <code>regWidth</code> and <code>log2m</code> between the specified
+ * <code>HLL.{MINIMUM,MAXIMUM}_{REGWIDTH,LOG2M}_PARAM</code> constants.
+ *
+ * @see #largeEstimator(int, int, double)
+ * @see #largeEstimatorCutoff(int, int)
+ * @see "<a href='http://research.neustar.biz/2013/01/24/hyperloglog-googles-take-on-engineering-hll/'>Blog post with section on 2^L</a>"
+ */
+ private static final double[] TWO_TO_L = new double[(HLL.MAXIMUM_REGWIDTH_PARAM + 1) * (HLL.MAXIMUM_LOG2M_PARAM + 1)];
+
+ /**
+ * Spacing constant used to compute offsets into {@link #TWO_TO_L}.
+ */
+ private static final int REG_WIDTH_INDEX_MULTIPLIER = HLL.MAXIMUM_LOG2M_PARAM + 1;
+
+ static {
+ for(int regWidth = HLL.MINIMUM_REGWIDTH_PARAM; regWidth <= HLL.MAXIMUM_REGWIDTH_PARAM; regWidth++) {
+ for(int log2m = HLL.MINIMUM_LOG2M_PARAM ; log2m <= HLL.MAXIMUM_LOG2M_PARAM; log2m++) {
+ int maxRegisterValue = (1 << regWidth) - 1;
+
+ // Since 1 is added to p(w) in the insertion algorithm, only
+ // (maxRegisterValue - 1) bits are inspected hence the hash
+ // space is one power of two smaller.
+ final int pwBits = (maxRegisterValue - 1);
+ final int totalBits = (pwBits + log2m);
+ final double twoToL = Math.pow(2, totalBits);
+ TWO_TO_L[(REG_WIDTH_INDEX_MULTIPLIER * regWidth) + log2m] = twoToL;
+ }
+ }
+ }
+
+ // ************************************************************************
+ /**
+ * Computes the bit-width of HLL registers necessary to estimate a set of
+ * the specified cardinality.
+ *
+ * @param expectedUniqueElements an upper bound on the number of unique
+ * elements that are expected. This must be greater than zero.
+ * @return a register size in bits (i.e. <code>log2(log2(n))</code>)
+ */
+ public static int registerBitSize(final long expectedUniqueElements) {
+ return Math.max(HLL.MINIMUM_REGWIDTH_PARAM,
+ (int)Math.ceil(NumberUtil.log2(NumberUtil.log2(expectedUniqueElements))));
+ }
+
+ // ========================================================================
+ /**
+ * Computes the 'alpha-m-squared' constant used by the HyperLogLog algorithm.
+ *
+ * @param m this must be a power of two, cannot be less than
+ * 16 (2<sup>4</sup>), and cannot be greater than 65536 (2<sup>16</sup>).
+ * @return gamma times <code>registerCount</code> squared where gamma is
+ * based on the value of <code>registerCount</code>.
+ * @throws IllegalArgumentException if <code>registerCount</code> is less
+ * than 16.
+ */
+ public static double alphaMSquared(final int m) {
+ switch(m) {
+ case 1/*2^0*/:
+ case 2/*2^1*/:
+ case 4/*2^2*/:
+ case 8/*2^3*/:
+ throw new IllegalArgumentException("'m' cannot be less than 16 (" + m + " < 16).");
+
+ case 16/*2^4*/:
+ return 0.673 * m * m;
+
+ case 32/*2^5*/:
+ return 0.697 * m * m;
+
+ case 64/*2^6*/:
+ return 0.709 * m * m;
+
+ default/*>2^6*/:
+ return (0.7213 / (1.0 + 1.079 / m)) * m * m;
+ }
+ }
+
+ // ========================================================================
+ /**
+ * Computes a mask that prevents overflow of HyperLogLog registers.
+ *
+ * @param registerSizeInBits the size of the HLL registers, in bits.
+ * @return mask a <code>long</code> mask to prevent overflow of the registers
+ * @see #registerBitSize(long)
+ */
+ public static long pwMaxMask(final int registerSizeInBits) {
+ return PW_MASK[registerSizeInBits];
+ }
+
+ // ========================================================================
+ /**
+ * The cutoff for using the "small range correction" formula, in the
+ * HyperLogLog algorithm.
+ *
+ * @param m the number of registers in the HLL. <em>m<em> in the paper.
+ * @return the cutoff for the small range correction.
+ * @see #smallEstimator(int, int)
+ */
+ public static double smallEstimatorCutoff(final int m) {
+ return ((double)m * 5) / 2;
+ }
+
+ /**
+ * The "small range correction" formula from the HyperLogLog algorithm. Only
+ * appropriate if both the estimator is smaller than <pre>(5/2) * m</pre> and
+ * there are still registers that have the zero value.
+ *
+ * @param m the number of registers in the HLL. <em>m<em> in the paper.
+ * @param numberOfZeroes the number of registers with value zero. <em>V</em>
+ * in the paper.
+ * @return a corrected cardinality estimate.
+ */
+ public static double smallEstimator(final int m, final int numberOfZeroes) {
+ return m * Math.log((double)m / numberOfZeroes);
+ }
+
+ /**
+ * The cutoff for using the "large range correction" formula, from the
+ * HyperLogLog algorithm, adapted for 64 bit hashes.
+ *
+ * @param log2m log-base-2 of the number of registers in the HLL. <em>b<em> in the paper.
+ * @param registerSizeInBits the size of the HLL registers, in bits.
+ * @return the cutoff for the large range correction.
+ * @see #largeEstimator(int, int, double)
+ * @see "<a href='http://research.neustar.biz/2013/01/24/hyperloglog-googles-take-on-engineering-hll/'>Blog post with section on 64 bit hashes and 'large range correction' cutoff</a>"
+ */
+ public static double largeEstimatorCutoff(final int log2m, final int registerSizeInBits) {
+ return (TWO_TO_L[(REG_WIDTH_INDEX_MULTIPLIER * registerSizeInBits) + log2m]) / 30.0;
+ }
+
+ /**
+ * The "large range correction" formula from the HyperLogLog algorithm, adapted
+ * for 64 bit hashes. Only appropriate for estimators whose value exceeds
+ * the return of {@link #largeEstimatorCutoff(int, int)}.
+ *
+ * @param log2m log-base-2 of the number of registers in the HLL. <em>b<em> in the paper.
+ * @param registerSizeInBits the size of the HLL registers, in bits.
+ * @param estimator the original estimator ("E" in the paper).
+ * @return a corrected cardinality estimate.
+ * @see "<a href='http://research.neustar.biz/2013/01/24/hyperloglog-googles-take-on-engineering-hll/'>Blog post with section on 64 bit hashes and 'large range correction'</a>"
+ */
+ public static double largeEstimator(final int log2m, final int registerSizeInBits, final double estimator) {
+ final double twoToL = TWO_TO_L[(REG_WIDTH_INDEX_MULTIPLIER * registerSizeInBits) + log2m];
+ return -1 * twoToL * Math.log(1.0 - (estimator/twoToL));
+ }
+}
diff --git a/solr/core/src/java/org/apache/solr/util/hll/IHLLMetadata.java b/solr/core/src/java/org/apache/solr/util/hll/IHLLMetadata.java
new file mode 100644
index 0000000..79b294e
--- /dev/null
+++ b/solr/core/src/java/org/apache/solr/util/hll/IHLLMetadata.java
@@ -0,0 +1,71 @@
+package org.apache.solr.util.hll;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * The metadata and parameters associated with a HLL.
+ */
+interface IHLLMetadata {
+ /**
+ * @return the schema version of the HLL. This will never be <code>null</code>.
+ */
+ int schemaVersion();
+
+ /**
+ * @return the type of the HLL. This will never be <code>null</code>.
+ */
+ HLLType HLLType();
+
+ /**
+ * @return the log-base-2 of the register count parameter of the HLL. This
+ * will always be greater than or equal to 4 and less than or equal
+ * to 31.
+ */
+ int registerCountLog2();
+
+ /**
+ * @return the register width parameter of the HLL. This will always be
+ * greater than or equal to 1 and less than or equal to 8.
+ */
+ int registerWidth();
+
+ /**
+ * @return the log-base-2 of the explicit cutoff cardinality. This will always
+ * be greater than or equal to zero and less than 31, per the specification.
+ */
+ int log2ExplicitCutoff();
+
+ /**
+ * @return <code>true</code> if the {@link HLLType#EXPLICIT} representation
+ * has been disabled. <code>false</code> otherwise.
+ */
+ boolean explicitOff();
+
+ /**
+ * @return <code>true</code> if the {@link HLLType#EXPLICIT} representation
+ * cutoff cardinality is set to be automatically chosen,
+ * <code>false</code> otherwise.
+ */
+ boolean explicitAuto();
+
+ /**
+ * @return <code>true</code> if the {@link HLLType#SPARSE} representation
+ * is enabled.
+ */
+ boolean sparseEnabled();
+}
\ No newline at end of file
diff --git a/solr/core/src/java/org/apache/solr/util/hll/ISchemaVersion.java b/solr/core/src/java/org/apache/solr/util/hll/ISchemaVersion.java
new file mode 100644
index 0000000..c364afa
--- /dev/null
+++ b/solr/core/src/java/org/apache/solr/util/hll/ISchemaVersion.java
@@ -0,0 +1,85 @@
+package org.apache.solr.util.hll;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * A serialization schema for HLLs. Reads and writes HLL metadata to
+ * and from <code>byte[]</code> representations.
+ */
+interface ISchemaVersion {
+ /**
+ * The number of metadata bytes required for a serialized HLL of the
+ * specified type.
+ *
+ * @param type the type of the serialized HLL
+ * @return the number of padding bytes needed in order to fully accommodate
+ * the needed metadata.
+ */
+ int paddingBytes(HLLType type);
+
+ /**
+ * Writes metadata bytes to serialized HLL.
+ *
+ * @param bytes the padded data bytes of the HLL
+ * @param metadata the metadata to write to the padding bytes
+ */
+ void writeMetadata(byte[] bytes, IHLLMetadata metadata);
+
+ /**
+ * Reads the metadata bytes of the serialized HLL.
+ *
+ * @param bytes the serialized HLL
+ * @return the HLL metadata
+ */
+ IHLLMetadata readMetadata(byte[] bytes);
+
+ /**
+ * Builds an HLL serializer that matches this schema version.
+ *
+ * @param type the HLL type that will be serialized. This cannot be
+ * <code>null</code>.
+ * @param wordLength the length of the 'words' that comprise the data of the
+ * HLL. Words must be at least 5 bits and at most 64 bits long.
+ * @param wordCount the number of 'words' in the HLL's data.
+ * @return a byte array serializer used to serialize a HLL according
+ * to this schema version's specification.
+ * @see #paddingBytes(HLLType)
+ * @see IWordSerializer
+ */
+ IWordSerializer getSerializer(HLLType type, int wordLength, int wordCount);
+
+ /**
+ * Builds an HLL deserializer that matches this schema version.
+ *
+ * @param type the HLL type that will be deserialized. This cannot be
+ * <code>null</code>.
+ * @param wordLength the length of the 'words' that comprise the data of the
+ * serialized HLL. Words must be at least 5 bits and at most 64
+ * bits long.
+ * @param bytes the serialized HLL to deserialize. This cannot be
+ * <code>null</code>.
+ * @return a byte array deserializer used to deserialize a HLL serialized
+ * according to this schema version's specification.
+ */
+ IWordDeserializer getDeserializer(HLLType type, int wordLength, byte[] bytes);
+
+ /**
+ * @return the schema version number.
+ */
+ int schemaVersionNumber();
+}
diff --git a/solr/core/src/java/org/apache/solr/util/hll/IWordDeserializer.java b/solr/core/src/java/org/apache/solr/util/hll/IWordDeserializer.java
new file mode 100644
index 0000000..7e37425
--- /dev/null
+++ b/solr/core/src/java/org/apache/solr/util/hll/IWordDeserializer.java
@@ -0,0 +1,41 @@
+package org.apache.solr.util.hll;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Reads 'words' of a fixed width, in sequence, from a byte array.
+ */
+public interface IWordDeserializer {
+ /**
+ * @return the next word in the sequence. Should not be called more than
+ * {@link #totalWordCount()} times.
+ */
+ long readWord();
+
+ /**
+ * Returns the number of words that could be encoded in the sequence.
+ *
+ * NOTE: the sequence that was encoded may be shorter than the value this
+ * method returns due to padding issues within bytes. This guarantees
+ * only an upper bound on the number of times {@link #readWord()}
+ * can be called.
+ *
+ * @return the maximum number of words that could be read from the sequence.
+ */
+ int totalWordCount();
+}
\ No newline at end of file
diff --git a/solr/core/src/java/org/apache/solr/util/hll/IWordSerializer.java b/solr/core/src/java/org/apache/solr/util/hll/IWordSerializer.java
new file mode 100644
index 0000000..10f75df
--- /dev/null
+++ b/solr/core/src/java/org/apache/solr/util/hll/IWordSerializer.java
@@ -0,0 +1,39 @@
+package org.apache.solr.util.hll;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Writes 'words' of fixed width, in sequence, to a byte array.
+ */
+interface IWordSerializer {
+
+ /**
+ * Writes the word to the backing array.
+ *
+ * @param word the word to write.
+ */
+ void writeWord(final long word);
+
+ /**
+ * Returns the backing array of <code>byte</code>s that contain the serialized
+ * words.
+ * @return the serialized words as a <code>byte[]</code>.
+ */
+ byte[] getBytes();
+
+}
\ No newline at end of file
diff --git a/solr/core/src/java/org/apache/solr/util/hll/LongIterator.java b/solr/core/src/java/org/apache/solr/util/hll/LongIterator.java
new file mode 100644
index 0000000..4ad72b5
--- /dev/null
+++ b/solr/core/src/java/org/apache/solr/util/hll/LongIterator.java
@@ -0,0 +1,35 @@
+package org.apache.solr.util.hll;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * A <code>long</code>-based iterator. This is not <i>is-a</i> {@link java.util.Iterator}
+ * to prevent autoboxing between <code>Long</code> and <code>long</code>.
+ */
+interface LongIterator {
+ /**
+ * @return <code>true</code> if and only if there are more elements to
+ * iterate over. <code>false</code> otherwise.
+ */
+ boolean hasNext();
+
+ /**
+ * @return the next <code>long</code> in the collection.
+ */
+ long next();
+}
\ No newline at end of file
diff --git a/solr/core/src/java/org/apache/solr/util/hll/NumberUtil.java b/solr/core/src/java/org/apache/solr/util/hll/NumberUtil.java
new file mode 100644
index 0000000..1c5c184
--- /dev/null
+++ b/solr/core/src/java/org/apache/solr/util/hll/NumberUtil.java
@@ -0,0 +1,172 @@
+package org.apache.solr.util.hll;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * A collection of utilities to work with numbers.
+ */
+class NumberUtil {
+ // loge(2) (log-base e of 2)
+ public static final double LOGE_2 = 0.6931471805599453;
+
+ // ************************************************************************
+ /**
+ * Computes the <code>log2</code> (log-base-two) of the specified value.
+ *
+ * @param value the <code>double</code> for which the <code>log2</code> is
+ * desired.
+ * @return the <code>log2</code> of the specified value
+ */
+ public static double log2(final double value) {
+ // REF: http://en.wikipedia.org/wiki/Logarithmic_scale (conversion of bases)
+ return Math.log(value) / LOGE_2;
+ }
+
+ // ========================================================================
+ // the hex characters
+ private static final char[] HEX = { '0', '1', '2', '3', '4', '5', '6', '7',
+ '8', '9', 'A', 'B', 'C', 'D', 'E', 'F' };
+
+ // ------------------------------------------------------------------------
+ /**
+ * Converts the specified array of <code>byte</code>s into a string of
+ * hex characters (low <code>byte</code> first).
+ *
+ * @param bytes the array of <code>byte</code>s that are to be converted.
+ * This cannot be <code>null</code> though it may be empty.
+ * @param offset the offset in <code>bytes</code> at which the bytes will
+ * be taken. This cannot be negative and must be less than
+ * <code>bytes.length - 1</code>.
+ * @param count the number of bytes to be retrieved from the specified array.
+ * This cannot be negative. If greater than <code>bytes.length - offset</code>
+ * then that value is used.
+ * @return a string of at most <code>count</code> characters that represents
+ * the specified byte array in hex. This will never be <code>null</code>
+ * though it may be empty if <code>bytes</code> is empty or <code>count</code>
+ * is zero.
+ * @throws IllegalArgumentException if <code>offset</code> is greater than
+ * or equal to <code>bytes.length</code>.
+ * @see #fromHex(String, int, int)
+ */
+ public static String toHex(final byte[] bytes, final int offset, final int count) {
+ if(offset >= bytes.length) throw new IllegalArgumentException("Offset is greater than the length (" + offset + " >= " + bytes.length + ").")/*by contract*/;
+ final int byteCount = Math.min( (bytes.length - offset), count);
+ final int upperBound = byteCount + offset;
+
+ final char[] chars = new char[byteCount * 2/*two chars per byte*/];
+ int charIndex = 0;
+ for(int i=offset; i<upperBound; i++) {
+ final byte value = bytes[i];
+ chars[charIndex++] = HEX[(value >>> 4) & 0x0F];
+ chars[charIndex++] = HEX[value & 0x0F];
+ }
+
+ return new String(chars);
+ }
+
+ /**
+ * Converts the specified array of hex characters into an array of <code>byte</code>s
+ * (low <code>byte</code> first).
+ *
+ * @param string the string of hex characters to be converted into <code>byte</code>s.
+ * This cannot be <code>null</code> though it may be blank.
+ * @param offset the offset in the string at which the characters will be
+ * taken. This cannot be negative and must be less than <code>string.length() - 1</code>.
+ * @param count the number of characters to be retrieved from the specified
+ * string. This cannot be negative and must be divisible by two
+ * (since there are two characters per <code>byte</code>).
+ * @return the array of <code>byte</code>s that were converted from the
+ * specified string (in the specified range). This will never be
+ * <code>null</code> though it may be empty if <code>string</code>
+ * is empty or <code>count</code> is zero.
+ * @throws IllegalArgumentException if <code>offset</code> is greater than
+ * or equal to <code>string.length()</code> or if <code>count</code>
+ * is not divisible by two.
+ * @see #toHex(byte[], int, int)
+ */
+ public static byte[] fromHex(final String string, final int offset, final int count) {
+ if(offset >= string.length()) throw new IllegalArgumentException("Offset is greater than the length (" + offset + " >= " + string.length() + ").")/*by contract*/;
+ if( (count & 0x01) != 0) throw new IllegalArgumentException("Count is not divisible by two (" + count + ").")/*by contract*/;
+ final int charCount = Math.min((string.length() - offset), count);
+ final int upperBound = offset + charCount;
+
+ final byte[] bytes = new byte[charCount >>> 1/*aka /2*/];
+ int byteIndex = 0/*beginning*/;
+ for(int i=offset; i<upperBound; i+=2) {
+ bytes[byteIndex++] = (byte)(( (digit(string.charAt(i)) << 4)
+ | digit(string.charAt(i + 1))) & 0xFF);
+ }
+
+ return bytes;
+ }
+
+ // ------------------------------------------------------------------------
+ /**
+ * @param character a hex character to be converted to a <code>byte</code>.
+ * This cannot be a character other than [a-fA-F0-9].
+ * @return the value of the specified character. This will be a value <code>0</code>
+ * through <code>15</code>.
+ * @throws IllegalArgumentException if the specified character is not in
+ * [a-fA-F0-9]
+ */
+ private static final int digit(final char character) {
+ switch(character) {
+ case '0':
+ return 0;
+ case '1':
+ return 1;
+ case '2':
+ return 2;
+ case '3':
+ return 3;
+ case '4':
+ return 4;
+ case '5':
+ return 5;
+ case '6':
+ return 6;
+ case '7':
+ return 7;
+ case '8':
+ return 8;
+ case '9':
+ return 9;
+ case 'a':
+ case 'A':
+ return 10;
+ case 'b':
+ case 'B':
+ return 11;
+ case 'c':
+ case 'C':
+ return 12;
+ case 'd':
+ case 'D':
+ return 13;
+ case 'e':
+ case 'E':
+ return 14;
+ case 'f':
+ case 'F':
+ return 15;
+
+ default:
+ throw new IllegalArgumentException("Character is not in [a-fA-F0-9] ('" + character + "').");
+ }
+ }
+}
\ No newline at end of file
diff --git a/solr/core/src/java/org/apache/solr/util/hll/SchemaVersionOne.java b/solr/core/src/java/org/apache/solr/util/hll/SchemaVersionOne.java
new file mode 100644
index 0000000..e73c0cf
--- /dev/null
+++ b/solr/core/src/java/org/apache/solr/util/hll/SchemaVersionOne.java
@@ -0,0 +1,154 @@
+package org.apache.solr.util.hll;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * A concrete {@link ISchemaVersion} representing schema version one.
+ */
+class SchemaVersionOne implements ISchemaVersion {
+ /**
+ * The schema version number for this instance.
+ */
+ public static final int SCHEMA_VERSION = 1;
+
+ // ------------------------------------------------------------------------
+ // Version-specific ordinals (array position) for each of the HLL types
+ private static final HLLType[] TYPE_ORDINALS = new HLLType[] {
+ HLLType.EMPTY,
+ HLLType.EXPLICIT,
+ HLLType.SPARSE,
+ HLLType.FULL
+ };
+
+ // ------------------------------------------------------------------------
+ // number of header bytes for all HLL types
+ private static final int HEADER_BYTE_COUNT = 3;
+
+ // sentinel values from the spec for explicit off and auto
+ private static final int EXPLICIT_OFF = 0;
+ private static final int EXPLICIT_AUTO = 63;
+
+ // ************************************************************************
+ /* (non-Javadoc)
+ * @see net.agkn.hll.serialization.ISchemaVersion#paddingBytes(HLLType)
+ */
+ @Override
+ public int paddingBytes(final HLLType type) {
+ return HEADER_BYTE_COUNT;
+ }
+
+ /* (non-Javadoc)
+ * @see net.agkn.hll.serialization.ISchemaVersion#writeMetadata(byte[], IHLLMetadata)
+ */
+ @Override
+ public void writeMetadata(final byte[] bytes, final IHLLMetadata metadata) {
+ final HLLType type = metadata.HLLType();
+ final int typeOrdinal = getOrdinal(type);
+
+ final int explicitCutoffValue;
+ if(metadata.explicitOff()) {
+ explicitCutoffValue = EXPLICIT_OFF;
+ } else if(metadata.explicitAuto()) {
+ explicitCutoffValue = EXPLICIT_AUTO;
+ } else {
+ explicitCutoffValue = metadata.log2ExplicitCutoff() + 1/*per spec*/;
+ }
+
+ bytes[0] = SerializationUtil.packVersionByte(SCHEMA_VERSION, typeOrdinal);
+ bytes[1] = SerializationUtil.packParametersByte(metadata.registerWidth(), metadata.registerCountLog2());
+ bytes[2] = SerializationUtil.packCutoffByte(explicitCutoffValue, metadata.sparseEnabled());
+ }
+
+ /* (non-Javadoc)
+ * @see net.agkn.hll.serialization.ISchemaVersion#readMetadata(byte[])
+ */
+ @Override
+ public IHLLMetadata readMetadata(final byte[] bytes) {
+ final byte versionByte = bytes[0];
+ final byte parametersByte = bytes[1];
+ final byte cutoffByte = bytes[2];
+
+ final int typeOrdinal = SerializationUtil.typeOrdinal(versionByte);
+ final int explicitCutoffValue = SerializationUtil.explicitCutoff(cutoffByte);
+ final boolean explicitOff = (explicitCutoffValue == EXPLICIT_OFF);
+ final boolean explicitAuto = (explicitCutoffValue == EXPLICIT_AUTO);
+ final int log2ExplicitCutoff = (explicitOff || explicitAuto) ? -1/*sentinel*/ : (explicitCutoffValue - 1/*per spec*/);
+
+ return new HLLMetadata(SCHEMA_VERSION,
+ getType(typeOrdinal),
+ SerializationUtil.registerCountLog2(parametersByte),
+ SerializationUtil.registerWidth(parametersByte),
+ log2ExplicitCutoff,
+ explicitOff,
+ explicitAuto,
+ SerializationUtil.sparseEnabled(cutoffByte));
+ }
+
+ /* (non-Javadoc)
+ * @see net.agkn.hll.serialization.ISchemaVersion#getSerializer(HLLType, int, int)
+ */
+ @Override
+ public IWordSerializer getSerializer(HLLType type, int wordLength, int wordCount) {
+ return new BigEndianAscendingWordSerializer(wordLength, wordCount, paddingBytes(type));
+ }
+
+ /* (non-Javadoc)
+ * @see net.agkn.hll.serialization.ISchemaVersion#getDeserializer(HLLType, int, byte[])
+ */
+ @Override
+ public IWordDeserializer getDeserializer(HLLType type, int wordLength, byte[] bytes) {
+ return new BigEndianAscendingWordDeserializer(wordLength, paddingBytes(type), bytes);
+ }
+
+ /* (non-Javadoc)
+ * @see net.agkn.hll.serialization.ISchemaVersion#schemaVersionNumber()
+ */
+ @Override
+ public int schemaVersionNumber() {
+ return SCHEMA_VERSION;
+ }
+
+ // ========================================================================
+ // Type/Ordinal lookups
+ /**
+ * Gets the ordinal for the specified {@link HLLType}.
+ *
+ * @param type the type whose ordinal is desired
+ * @return the ordinal for the specified type, to be used in the version byte.
+ * This will always be non-negative.
+ */
+ private static int getOrdinal(final HLLType type) {
+ for(int i=0; i<TYPE_ORDINALS.length; i++) {
+ if(TYPE_ORDINALS[i].equals(type)) return i;
+ }
+ throw new RuntimeException("Unknown HLL type " + type);
+ }
+
+ /**
+ * Gets the {@link HLLType} for the specified ordinal.
+ *
+ * @param ordinal the ordinal whose type is desired
+ * @return the type for the specified ordinal. This will never be <code>null</code>.
+ */
+ private static HLLType getType(final int ordinal) {
+ if((ordinal < 0) || (ordinal >= TYPE_ORDINALS.length)) {
+ throw new IllegalArgumentException("Invalid type ordinal '" + ordinal + "'. Only 0-" + (TYPE_ORDINALS.length - 1) + " inclusive allowed.");
+ }
+ return TYPE_ORDINALS[ordinal];
+ }
+}
diff --git a/solr/core/src/java/org/apache/solr/util/hll/SerializationUtil.java b/solr/core/src/java/org/apache/solr/util/hll/SerializationUtil.java
new file mode 100644
index 0000000..bcf55a3
--- /dev/null
+++ b/solr/core/src/java/org/apache/solr/util/hll/SerializationUtil.java
@@ -0,0 +1,277 @@
+package org.apache.solr.util.hll;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * A collection of constants and utilities for serializing and deserializing
+ * HLLs.
+ *
+ * NOTE: 'package' visibility is used for many methods that only need to be
+ * used by the {@link ISchemaVersion} implementations. The structure of
+ * a serialized HLL's metadata should be opaque to the rest of the
+ * library.
+ */
+class SerializationUtil {
+ /**
+ * The number of bits (of the parameters byte) dedicated to encoding the
+ * width of the registers.
+ */
+ /*package*/ static int REGISTER_WIDTH_BITS = 3;
+
+ /**
+ * A mask to cap the maximum value of the register width.
+ */
+ /*package*/ static int REGISTER_WIDTH_MASK = (1 << REGISTER_WIDTH_BITS) - 1;
+
+ /**
+ * The number of bits (of the parameters byte) dedicated to encoding
+ * <code>log2(registerCount)</code>.
+ */
+ /*package*/ static int LOG2_REGISTER_COUNT_BITS = 5;
+
+ /**
+ * A mask to cap the maximum value of <code>log2(registerCount)</code>.
+ */
+ /*package*/ static int LOG2_REGISTER_COUNT_MASK = (1 << LOG2_REGISTER_COUNT_BITS) - 1;
+
+ /**
+ * The number of bits (of the cutoff byte) dedicated to encoding the
+ * log-base-2 of the explicit cutoff or sentinel values for
+ * 'explicit-disabled' or 'auto'.
+ */
+ /*package*/ static int EXPLICIT_CUTOFF_BITS = 6;
+
+ /**
+ * A mask to cap the maximum value of the explicit cutoff choice.
+ */
+ /*package*/ static int EXPLICIT_CUTOFF_MASK = (1 << EXPLICIT_CUTOFF_BITS) - 1;
+
+ /**
+ * Number of bits in a nibble.
+ */
+ private static int NIBBLE_BITS = 4;
+
+ /**
+ * A mask to cap the maximum value of a nibble.
+ */
+ private static int NIBBLE_MASK = (1 << NIBBLE_BITS) - 1;
+
+ // ************************************************************************
+ // Serialization utilities
+
+ /**
+ * Schema version one (v1).
+ */
+ public static ISchemaVersion VERSION_ONE = new SchemaVersionOne();
+
+ /**
+ * The default schema version for serializing HLLs.
+ */
+ public static ISchemaVersion DEFAULT_SCHEMA_VERSION = VERSION_ONE;
+
+ /**
+ * List of registered schema versions, indexed by their version numbers. If
+ * an entry is <code>null</code>, then no such schema version is registered.
+ * Similarly, registering a new schema version simply entails assigning an
+ * {@link ISchemaVersion} instance to the appropriate index of this array.<p/>
+ *
+ * By default, only {@link SchemaVersionOne} is registered. Note that version
+ * zero will always be reserved for internal (e.g. proprietary, legacy) schema
+ * specifications/implementations and will never be assigned to in by this
+ * library.
+ */
+ public static ISchemaVersion[] REGISTERED_SCHEMA_VERSIONS = new ISchemaVersion[16];
+
+ static {
+ REGISTERED_SCHEMA_VERSIONS[1] = VERSION_ONE;
+ }
+
+ /**
+ * @param schemaVersionNumber the version number of the {@link ISchemaVersion}
+ * desired. This must be a registered schema version number.
+ * @return The {@link ISchemaVersion} for the given number. This will never
+ * be <code>null</code>.
+ */
+ public static ISchemaVersion getSchemaVersion(final int schemaVersionNumber) {
+ if(schemaVersionNumber >= REGISTERED_SCHEMA_VERSIONS.length || schemaVersionNumber < 0) {
+ throw new RuntimeException("Invalid schema version number " + schemaVersionNumber);
+ }
+ final ISchemaVersion schemaVersion = REGISTERED_SCHEMA_VERSIONS[schemaVersionNumber];
+ if(schemaVersion == null) {
+ throw new RuntimeException("Unknown schema version number " + schemaVersionNumber);
+ }
+ return schemaVersion;
+ }
+
+ /**
+ * Get the appropriate {@link ISchemaVersion schema version} for the specified
+ * serialized HLL.
+ *
+ * @param bytes the serialized HLL whose schema version is desired.
+ * @return the schema version for the specified HLL. This will never
+ * be <code>null</code>.
+ */
+ public static ISchemaVersion getSchemaVersion(final byte[] bytes) {
+ final byte versionByte = bytes[0];
+ final int schemaVersionNumber = schemaVersion(versionByte);
+
+ return getSchemaVersion(schemaVersionNumber);
+ }
+
+ // ************************************************************************
+ // Package-specific shared helpers
+
+ /**
+ * Generates a byte that encodes the schema version and the type ordinal
+ * of the HLL.
+ *
+ * The top nibble is the schema version and the bottom nibble is the type
+ * ordinal.
+ *
+ * @param schemaVersion the schema version to encode.
+ * @param typeOrdinal the type ordinal of the HLL to encode.
+ * @return the packed version byte
+ */
+ public static byte packVersionByte(final int schemaVersion, final int typeOrdinal) {
+ return (byte)(((NIBBLE_MASK & schemaVersion) << NIBBLE_BITS) | (NIBBLE_MASK & typeOrdinal));
+ }
+ /**
+ * Generates a byte that encodes the log-base-2 of the explicit cutoff
+ * or sentinel values for 'explicit-disabled' or 'auto', as well as the
+ * boolean indicating whether to use {@link HLLType#SPARSE}
+ * in the promotion hierarchy.
+ *
+ * The top bit is always padding, the second highest bit indicates the
+ * 'sparse-enabled' boolean, and the lowest six bits encode the explicit
+ * cutoff value.
+ *
+ * @param explicitCutoff the explicit cutoff value to encode.
+ * <ul>
+ * <li>
+ * If 'explicit-disabled' is chosen, this value should be <code>0</code>.
+ * </li>
+ * <li>
+ * If 'auto' is chosen, this value should be <code>63</code>.
+ * </li>
+ * <li>
+ * If a cutoff of 2<sup>n</sup> is desired, for <code>0 <= n < 31</code>,
+ * this value should be <code>n + 1</code>.
+ * </li>
+ * </ul>
+ * @param sparseEnabled whether {@link HLLType#SPARSE}
+ * should be used in the promotion hierarchy to improve HLL
+ * storage.
+ *
+ * @return the packed cutoff byte
+ */
+ public static byte packCutoffByte(final int explicitCutoff, final boolean sparseEnabled) {
+ final int sparseBit = (sparseEnabled ? (1 << EXPLICIT_CUTOFF_BITS) : 0);
+ return (byte)(sparseBit | (EXPLICIT_CUTOFF_MASK & explicitCutoff));
+ }
+
+ /**
+ * Generates a byte that encodes the parameters of a
+ * {@link HLLType#FULL} or {@link HLLType#SPARSE}
+ * HLL.<p/>
+ *
+ * The top 3 bits are used to encode <code>registerWidth - 1</code>
+ * (range of <code>registerWidth</code> is thus 1-9) and the bottom 5
+ * bits are used to encode <code>registerCountLog2</code>
+ * (range of <code>registerCountLog2</code> is thus 0-31).
+ *
+ * @param registerWidth the register width (must be at least 1 and at
+ * most 9)
+ * @param registerCountLog2 the log-base-2 of the register count (must
+ * be at least 0 and at most 31)
+ * @return the packed parameters byte
+ */
+ public static byte packParametersByte(final int registerWidth, final int registerCountLog2) {
+ final int widthBits = ((registerWidth - 1) & REGISTER_WIDTH_MASK);
+ final int countBits = (registerCountLog2 & LOG2_REGISTER_COUNT_MASK);
+ return (byte)((widthBits << LOG2_REGISTER_COUNT_BITS) | countBits);
+ }
+
+ /**
+ * Extracts the 'sparse-enabled' boolean from the cutoff byte of a serialized
+ * HLL.
+ *
+ * @param cutoffByte the cutoff byte of the serialized HLL
+ * @return the 'sparse-enabled' boolean
+ */
+ public static boolean sparseEnabled(final byte cutoffByte) {
+ return ((cutoffByte >>> EXPLICIT_CUTOFF_BITS) & 1) == 1;
+ }
+
+ /**
+ * Extracts the explicit cutoff value from the cutoff byte of a serialized
+ * HLL.
+ *
+ * @param cutoffByte the cutoff byte of the serialized HLL
+ * @return the explicit cutoff value
+ */
+ public static int explicitCutoff(final byte cutoffByte) {
+ return (cutoffByte & EXPLICIT_CUTOFF_MASK);
+ }
+
+ /**
+ * Extracts the schema version from the version byte of a serialized
+ * HLL.
+ *
+ * @param versionByte the version byte of the serialized HLL
+ * @return the schema version of the serialized HLL
+ */
+ public static int schemaVersion(final byte versionByte) {
+ return NIBBLE_MASK & (versionByte >>> NIBBLE_BITS);
+ }
+
+ /**
+ * Extracts the type ordinal from the version byte of a serialized HLL.
+ *
+ * @param versionByte the version byte of the serialized HLL
+ * @return the type ordinal of the serialized HLL
+ */
+ public static int typeOrdinal(final byte versionByte) {
+ return (versionByte & NIBBLE_MASK);
+ }
+
+ /**
+ * Extracts the register width from the parameters byte of a serialized
+ * {@link HLLType#FULL} HLL.
+ *
+ * @param parametersByte the parameters byte of the serialized HLL
+ * @return the register width of the serialized HLL
+ *
+ * @see #packParametersByte(int, int)
+ */
+ public static int registerWidth(final byte parametersByte) {
+ return ((parametersByte >>> LOG2_REGISTER_COUNT_BITS) & REGISTER_WIDTH_MASK) + 1;
+ }
+
+ /**
+ * Extracts the log2(registerCount) from the parameters byte of a
+ * serialized {@link HLLType#FULL} HLL.
+ *
+ * @param parametersByte the parameters byte of the serialized HLL
+ * @return log2(registerCount) of the serialized HLL
+ *
+ * @see #packParametersByte(int, int)
+ */
+ public static int registerCountLog2(final byte parametersByte) {
+ return (parametersByte & LOG2_REGISTER_COUNT_MASK);
+ }
+}
diff --git a/solr/core/src/java/org/apache/solr/util/hll/package-info.java b/solr/core/src/java/org/apache/solr/util/hll/package-info.java
new file mode 100644
index 0000000..040b84e
--- /dev/null
+++ b/solr/core/src/java/org/apache/solr/util/hll/package-info.java
@@ -0,0 +1,24 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * A fork of <a href="https://github.com/aggregateknowledge/java-hll/">Java-HyperLogLog</a> package tweaked
+ * not to depend on fastutil and with cleanups to make it lean and clean.
+ */
+package org.apache.solr.util.hll;
+
+
diff --git a/solr/core/src/test/org/apache/solr/handler/component/StatsComponentTest.java b/solr/core/src/test/org/apache/solr/handler/component/StatsComponentTest.java
index 3a78c6a..34fabee 100644
--- a/solr/core/src/test/org/apache/solr/handler/component/StatsComponentTest.java
+++ b/solr/core/src/test/org/apache/solr/handler/component/StatsComponentTest.java
@@ -55,9 +55,9 @@
import org.apache.commons.math3.util.Combinations;
import com.tdunning.math.stats.AVLTreeDigest;
-import net.agkn.hll.HLL;
import com.google.common.hash.Hashing;
-import com.google.common.hash.HashFunction;
+import com.google.common.hash.HashFunction;
+import org.apache.solr.util.hll.HLL;
import org.junit.BeforeClass;
diff --git a/solr/core/src/test/org/apache/solr/handler/component/TestDistributedStatsComponentCardinality.java b/solr/core/src/test/org/apache/solr/handler/component/TestDistributedStatsComponentCardinality.java
index 7cceea0..d3e20f0 100644
--- a/solr/core/src/test/org/apache/solr/handler/component/TestDistributedStatsComponentCardinality.java
+++ b/solr/core/src/test/org/apache/solr/handler/component/TestDistributedStatsComponentCardinality.java
@@ -31,7 +31,7 @@
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.params.ModifiableSolrParams;
-import net.agkn.hll.HLL;
+import org.apache.solr.util.hll.HLL;
import com.google.common.hash.Hashing;
import com.google.common.hash.HashFunction;
diff --git a/solr/core/src/test/org/apache/solr/search/facet/TestJsonFacets.java b/solr/core/src/test/org/apache/solr/search/facet/TestJsonFacets.java
index 44f314a..b7d954d 100644
--- a/solr/core/src/test/org/apache/solr/search/facet/TestJsonFacets.java
+++ b/solr/core/src/test/org/apache/solr/search/facet/TestJsonFacets.java
@@ -28,7 +28,7 @@
import java.util.Random;
import com.tdunning.math.stats.AVLTreeDigest;
-import net.agkn.hll.HLL;
+import org.apache.solr.util.hll.HLL;
import org.apache.lucene.queryparser.flexible.standard.processors.NumericQueryNodeProcessor;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.packed.GrowableWriter;
diff --git a/solr/core/src/test/org/apache/solr/util/hll/BigEndianAscendingWordDeserializerTest.java b/solr/core/src/test/org/apache/solr/util/hll/BigEndianAscendingWordDeserializerTest.java
new file mode 100644
index 0000000..dab2937
--- /dev/null
+++ b/solr/core/src/test/org/apache/solr/util/hll/BigEndianAscendingWordDeserializerTest.java
@@ -0,0 +1,189 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.util.hll;
+
+import java.util.Random;
+
+import org.apache.lucene.util.LuceneTestCase;
+import org.junit.Test;
+
+import static com.carrotsearch.randomizedtesting.RandomizedTest.*;
+
+/**
+ * Unit and smoke tests for {@link BigEndianAscendingWordDeserializer}.
+ */
+public class BigEndianAscendingWordDeserializerTest extends LuceneTestCase {
+ /**
+ * Error checking tests for constructor.
+ */
+ @Test
+ public void constructorErrorTest() {
+ // word length too small
+ try {
+ new BigEndianAscendingWordDeserializer(0/*wordLength, below minimum of 1*/, 0/*bytePadding, arbitrary*/, new byte[1]/*bytes, arbitrary, not used here*/);
+ fail("Should complain about too-short words.");
+ } catch(final IllegalArgumentException e) {
+ assertTrue(e.getMessage().contains("Word length must be"));
+ }
+
+ // word length too large
+ try {
+ new BigEndianAscendingWordDeserializer(65/*wordLength, above maximum of 64*/, 0/*bytePadding, arbitrary*/, new byte[1]/*bytes, arbitrary, not used here*/);
+ fail("Should complain about too-long words.");
+ } catch(final IllegalArgumentException e) {
+ assertTrue(e.getMessage().contains("Word length must be"));
+ }
+
+ // byte padding negative
+ try {
+ new BigEndianAscendingWordDeserializer(5/*wordLength, arbitrary*/, -1/*bytePadding, too small*/, new byte[1]/*bytes, arbitrary, not used here*/);
+ fail("Should complain about negative byte padding.");
+ } catch(final IllegalArgumentException e) {
+ assertTrue(e.getMessage().contains("Byte padding must be"));
+ }
+ }
+
+ /**
+ * Smoke test using 64-bit short words and special word values.
+ */
+ @Test
+ public void smokeTest64BitWord() {
+ final BigEndianAscendingWordSerializer serializer =
+ new BigEndianAscendingWordSerializer(64/*wordLength*/,
+ 5/*wordCount*/,
+ 0/*bytePadding, arbitrary*/);
+
+ // Check that the sign bit is being preserved.
+ serializer.writeWord(-1L);
+ serializer.writeWord(-112894714L);
+
+ // Check "special" values
+ serializer.writeWord(0L);
+ serializer.writeWord(Long.MAX_VALUE);
+ serializer.writeWord(Long.MIN_VALUE);
+
+ final byte[] bytes = serializer.getBytes();
+
+ final BigEndianAscendingWordDeserializer deserializer =
+ new BigEndianAscendingWordDeserializer(64/*wordLength*/, 0/*bytePadding*/, bytes);
+
+ assertEquals(deserializer.totalWordCount(), 5/*wordCount*/);
+
+ assertEquals(deserializer.readWord(), -1L);
+ assertEquals(deserializer.readWord(), -112894714L);
+ assertEquals(deserializer.readWord(), 0L);
+ assertEquals(deserializer.readWord(), Long.MAX_VALUE);
+ assertEquals(deserializer.readWord(), Long.MIN_VALUE);
+ }
+
+ /**
+ * A smoke/fuzz test for ascending (from zero) word values.
+ */
+ @Test
+ public void ascendingSmokeTest() {
+ for(int wordLength=5; wordLength<65; wordLength++) {
+ runAscendingTest(wordLength, 3/*bytePadding, arbitrary*/, 100000/*wordCount, arbitrary*/);
+ }
+ }
+
+ /**
+ * A smoke/fuzz test for random word values.
+ */
+ @Test
+ public void randomSmokeTest() {
+ for(int wordLength=5; wordLength<65; wordLength++) {
+ runRandomTest(wordLength, 3/*bytePadding, arbitrary*/, 100000/*wordCount, arbitrary*/);
+ }
+ }
+
+ // ------------------------------------------------------------------------
+ /**
+ * Runs a test which serializes and deserializes random word values.
+ *
+ * @param wordLength the length of words to test
+ * @param bytePadding the number of bytes padding the byte array
+ * @param wordCount the number of word values to test
+ */
+ private static void runRandomTest(final int wordLength, final int bytePadding, final int wordCount) {
+ final long seed = randomLong();
+ final Random random = new Random(seed);
+ final Random verificationRandom = new Random(seed);
+
+ final long wordMask;
+ if(wordLength == 64) {
+ wordMask = ~0L;
+ } else {
+ wordMask = (1L << wordLength) - 1L;
+ }
+
+ final BigEndianAscendingWordSerializer serializer =
+ new BigEndianAscendingWordSerializer(wordLength/*wordLength, arbitrary*/,
+ wordCount,
+ bytePadding/*bytePadding, arbitrary*/);
+
+ for(int i=0; i<wordCount; i++) {
+ final long value = random.nextLong() & wordMask;
+ serializer.writeWord(value);
+ }
+
+ final byte[] bytes = serializer.getBytes();
+
+ final BigEndianAscendingWordDeserializer deserializer =
+ new BigEndianAscendingWordDeserializer(wordLength, bytePadding, bytes);
+
+ assertEquals(deserializer.totalWordCount(), wordCount);
+ for(int i=0; i<wordCount; i++) {
+ assertEquals(deserializer.readWord(), (verificationRandom.nextLong() & wordMask));
+ }
+ }
+
+ /**
+ * Runs a test which serializes and deserializes ascending (from zero) word values.
+ *
+ * @param wordLength the length of words to test
+ * @param bytePadding the number of bytes padding the byte array
+ * @param wordCount the number of word values to test
+ */
+ private static void runAscendingTest(final int wordLength, final int bytePadding, final int wordCount) {
+ final long wordMask;
+ if(wordLength == 64) {
+ wordMask = ~0L;
+ } else {
+ wordMask = (1L << wordLength) - 1L;
+ }
+
+ final BigEndianAscendingWordSerializer serializer =
+ new BigEndianAscendingWordSerializer(wordLength/*wordLength, arbitrary*/,
+ wordCount,
+ bytePadding/*bytePadding, arbitrary*/);
+
+ for(long i=0; i<wordCount; i++) {
+ serializer.writeWord(i & wordMask);
+ }
+
+ final byte[] bytes = serializer.getBytes();
+
+ final BigEndianAscendingWordDeserializer deserializer =
+ new BigEndianAscendingWordDeserializer(wordLength, bytePadding, bytes);
+
+ assertEquals(deserializer.totalWordCount(), wordCount);
+ for(long i=0; i<wordCount; i++) {
+ assertEquals(deserializer.readWord(), i & wordMask);
+ }
+ }
+}
diff --git a/solr/core/src/test/org/apache/solr/util/hll/BigEndianAscendingWordSerializerTest.java b/solr/core/src/test/org/apache/solr/util/hll/BigEndianAscendingWordSerializerTest.java
new file mode 100644
index 0000000..5cbaa0d
--- /dev/null
+++ b/solr/core/src/test/org/apache/solr/util/hll/BigEndianAscendingWordSerializerTest.java
@@ -0,0 +1,337 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.util.hll;
+
+import java.util.Arrays;
+
+import org.apache.lucene.util.LuceneTestCase;
+import org.junit.Test;
+
+/**
+ * Unit tests for {@link BigEndianAscendingWordSerializer}.
+ */
+public class BigEndianAscendingWordSerializerTest extends LuceneTestCase {
+ /**
+ * Error checking tests for constructor.
+ */
+ @Test
+ public void constructorErrorTest() {
+ // word length too small
+ try {
+ new BigEndianAscendingWordSerializer(0/*wordLength, below minimum of 1*/, 1/*wordCount, arbitrary*/, 0/*bytePadding, arbitrary*/);
+ fail("Should complain about too-short words.");
+ } catch(final IllegalArgumentException e) {
+ assertTrue(e.getMessage().contains("Word length must be"));
+ }
+
+ // word length too large
+ try {
+ new BigEndianAscendingWordSerializer(65/*wordLength, above max of 64*/, 1/*wordCount, arbitrary*/, 0/*bytePadding, arbitrary*/);
+ fail("Should complain about too-long words.");
+ } catch(final IllegalArgumentException e) {
+ assertTrue(e.getMessage().contains("Word length must be"));
+ }
+
+ // word count negative
+ try {
+ new BigEndianAscendingWordSerializer(5/*wordLength, arbitrary*/, -1/*wordCount, too small*/, 0/*bytePadding, arbitrary*/);
+ fail("Should complain about negative word count.");
+ } catch(final IllegalArgumentException e) {
+ assertTrue(e.getMessage().contains("Word count must be"));
+ }
+
+ // byte padding negative
+ try {
+ new BigEndianAscendingWordSerializer(5/*wordLength, arbitrary*/, 1/*wordCount, arbitrary*/, -1/*bytePadding, too small*/);
+ fail("Should complain about negative byte padding.");
+ } catch(final IllegalArgumentException e) {
+ assertTrue(e.getMessage().contains("Byte padding must be"));
+ }
+ }
+
+ /**
+ * Tests runtime exception thrown at premature call to {@link BigEndianAscendingWordSerializer#getBytes()}.
+ */
+ @Test
+ public void earlyGetBytesTest() {
+ final BigEndianAscendingWordSerializer serializer =
+ new BigEndianAscendingWordSerializer(5/*wordLength, arbitrary*/,
+ 1/*wordCount*/,
+ 0/*bytePadding, arbitrary*/);
+
+ // getBytes without enough writeWord should throw
+ try {
+ serializer.getBytes();
+ fail("Should throw.");
+ } catch(final RuntimeException e) {
+ assertTrue(e.getMessage().contains("Not all words"));
+ }
+ }
+
+ /**
+ */
+ @Test
+ public void smokeTestExplicitParams() {
+ final int shortWordLength = 64/*longs used in LongSetSlab*/;
+
+ {// Should work on an empty sequence, with no padding.
+ final BigEndianAscendingWordSerializer serializer =
+ new BigEndianAscendingWordSerializer(shortWordLength,
+ 0/*wordCount*/,
+ 0/*bytePadding, none*/);
+
+ assert(Arrays.equals(serializer.getBytes(), new byte[0]));
+ }
+ {// Should work on a byte-divisible sequence, with no padding.
+ final BigEndianAscendingWordSerializer serializer =
+ new BigEndianAscendingWordSerializer(shortWordLength,
+ 2/*wordCount*/,
+ 0/*bytePadding, none*/);
+
+ serializer.writeWord(0xBAAAAAAAAAAAAAACL);
+ serializer.writeWord(0x8FFFFFFFFFFFFFF1L);
+
+ // Bytes:
+ // ======
+ // 0xBA 0xAA 0xAA 0xAA 0xAA 0xAA 0xAA 0xAC
+ // 0x8F 0xFF 0xFF 0xFF 0xFF 0xFF 0xFF 0xF1
+ //
+ // -70 -86 ... -84
+ // -113 -1 ... -15
+ final byte[] bytes = serializer.getBytes();
+ final byte[] expectedBytes = new byte[] { -70, -86, -86, -86, -86, -86, -86, -84,
+ -113, -1, -1, -1, -1, -1, -1, -15 };
+ assertTrue(Arrays.equals(bytes, expectedBytes));
+ }
+ {// Should pad the array correctly.
+ final BigEndianAscendingWordSerializer serializer =
+ new BigEndianAscendingWordSerializer(shortWordLength,
+ 1/*wordCount*/,
+ 1/*bytePadding*/);
+
+ serializer.writeWord(1);
+ // 1 byte leading padding | value 1 | trailing padding
+ // 0000 0000 | 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0001
+ // 0x00 | 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x01
+ final byte[] bytes = serializer.getBytes();
+ final byte[] expectedBytes = new byte[] { 0, 0, 0, 0, 0, 0, 0, 0, 1 };
+ assertTrue(Arrays.equals(bytes, expectedBytes));
+ }
+ }
+
+ /**
+ * Smoke test for typical parameters used in practice.
+ */
+ @Test
+ public void smokeTestProbabilisticParams() {
+ // XXX: revisit this
+ final int shortWordLength = 5;
+ {// Should work on an empty sequence, with no padding.
+ final BigEndianAscendingWordSerializer serializer =
+ new BigEndianAscendingWordSerializer(shortWordLength,
+ 0/*wordCount*/,
+ 0/*bytePadding, none*/);
+
+ assert(Arrays.equals(serializer.getBytes(), new byte[0]));
+ }
+ {// Should work on a non-byte-divisible sequence, with no padding.
+ final BigEndianAscendingWordSerializer serializer =
+ new BigEndianAscendingWordSerializer(shortWordLength,
+ 3/*wordCount*/,
+ 0/*bytePadding, none*/);
+
+ serializer.writeWord(9);
+ serializer.writeWord(31);
+ serializer.writeWord(1);
+
+ // The values:
+ // -----------
+ // 9 |31 |1 |padding
+
+ // Corresponding bits:
+ // ------------------
+ // 0100 1|111 11|00 001|0
+
+ // And the hex/decimal (remember Java bytes are signed):
+ // -----------------------------------------------------
+ // 0100 1111 -> 0x4F -> 79
+ // 1100 0010 -> 0xC2 -> -62
+
+ final byte[] bytes = serializer.getBytes();
+ final byte[] expectedBytes = new byte[] { 79, -62 };
+ assertTrue(Arrays.equals(bytes, expectedBytes));
+ }
+ {// Should work on a byte-divisible sequence, with no padding.
+ final BigEndianAscendingWordSerializer serializer =
+ new BigEndianAscendingWordSerializer(shortWordLength,
+ 8/*wordCount*/,
+ 0/*bytePadding, none*/);
+
+ for(int i=1; i<9; i++) {
+ serializer.writeWord(i);
+ }
+
+ // Values: 1-8
+ // Corresponding bits:
+ // ------------------
+ // 00001
+ // 00010
+ // 00011
+ // 00100
+ // 00101
+ // 00110
+ // 00111
+ // 01000
+
+ // And the hex:
+ // ------------
+ // 0000 1000 => 0x08 => 8
+ // 1000 0110 => 0x86 => -122
+ // 0100 0010 => 0x62 => 66
+ // 1001 1000 => 0x98 => -104
+ // 1110 1000 => 0xE8 => -24
+
+ final byte[] bytes = serializer.getBytes();
+ final byte[] expectedBytes = new byte[] { 8, -122, 66, -104, -24 };
+ assertTrue(Arrays.equals(bytes, expectedBytes));
+ }
+ {// Should pad the array correctly.
+ final BigEndianAscendingWordSerializer serializer =
+ new BigEndianAscendingWordSerializer(shortWordLength,
+ 1/*wordCount*/,
+ 1/*bytePadding*/);
+
+ serializer.writeWord(1);
+ // 1 byte leading padding | value 1 | trailing padding
+ // 0000 0000 | 0000 1|000
+ final byte[] bytes = serializer.getBytes();
+ final byte[] expectedBytes = new byte[] { 0, 8 };
+ assertTrue(Arrays.equals(bytes, expectedBytes));
+ }
+ }
+
+ /**
+ * Smoke test for typical parameters used in practice.
+ */
+ @Test
+ public void smokeTestSparseParams() {
+ // XXX: revisit
+ final int shortWordLength = 17;
+ {// Should work on an empty sequence, with no padding.
+ final BigEndianAscendingWordSerializer serializer =
+ new BigEndianAscendingWordSerializer(shortWordLength,
+ 0/*wordCount*/,
+ 0/*bytePadding, none*/);
+
+ assert(Arrays.equals(serializer.getBytes(), new byte[0]));
+ }
+ {// Should work on a non-byte-divisible sequence, with no padding.
+ final BigEndianAscendingWordSerializer serializer =
+ new BigEndianAscendingWordSerializer(shortWordLength,
+ 3/*wordCount*/,
+ 0/*bytePadding, none*/);
+
+ serializer.writeWord(9);
+ serializer.writeWord(42);
+ serializer.writeWord(75);
+
+ // The values:
+ // -----------
+ // 9 |42 |75 |padding
+
+ // Corresponding bits:
+ // ------------------
+ // 0000 0000 0000 0100 1|000 0000 0000 1010 10|00 0000 0000 1001 011|0 0000
+
+ // And the hex/decimal (remember Java bytes are signed):
+ // -----------------------------------------------------
+ // 0000 0000 -> 0x00 -> 0
+ // 0000 0100 -> 0x04 -> 4
+ // 1000 0000 -> 0x80 -> -128
+ // 0000 1010 -> 0x0A -> 10
+ // 1000 0000 -> 0x80 -> -128
+ // 0000 1001 -> 0x09 -> 9
+ // 0110 0000 -> 0x60 -> 96
+
+ final byte[] bytes = serializer.getBytes();
+ final byte[] expectedBytes = new byte[] { 0, 4, -128, 10, -128, 9, 96 };
+ assertTrue(Arrays.equals(bytes, expectedBytes));
+ }
+ {// Should work on a byte-divisible sequence, with no padding.
+ final BigEndianAscendingWordSerializer serializer =
+ new BigEndianAscendingWordSerializer(shortWordLength,
+ 8/*wordCount*/,
+ 0/*bytePadding, none*/);
+
+ for(int i=1; i<9; i++) {
+ serializer.writeWord(i);
+ }
+
+ // Values: 1-8
+ // Corresponding bits:
+ // ------------------
+ // 0000 0000 0000 0000 1
+ // 000 0000 0000 0000 10
+ // 00 0000 0000 0000 011
+ // 0 0000 0000 0000 0100
+
+ // 0000 0000 0000 0010 1
+ // 000 0000 0000 0001 10
+ // 00 0000 0000 0000 111
+ // 0 0000 0000 0000 1000
+
+ // And the hex:
+ // ------------
+ // 0000 0000 -> 0x00 -> 0
+ // 0000 0000 -> 0x00 -> 0
+ // 1000 0000 -> 0x80 -> -128
+ // 0000 0000 -> 0x00 -> 0
+ // 1000 0000 -> 0x80 -> -128
+ // 0000 0000 -> 0x00 -> 0
+ // 0110 0000 -> 0x60 -> 96
+ // 0000 0000 -> 0x00 -> 0
+ // 0100 0000 -> 0x40 -> 64
+ // 0000 0000 -> 0x00 -> 0
+ // 0010 1000 -> 0x28 -> 40
+ // 0000 0000 -> 0x00 -> 0
+ // 0001 1000 -> 0x18 -> 24
+ // 0000 0000 -> 0x00 -> 0
+ // 0000 1110 -> 0x0D -> 14
+ // 0000 0000 -> 0x00 -> 0
+ // 0000 1000 -> 0x08 -> 8
+
+ final byte[] bytes = serializer.getBytes();
+ final byte[] expectedBytes = new byte[] { 0, 0, -128, 0, -128, 0, 96, 0, 64, 0, 40, 0, 24, 0, 14, 0, 8 };
+ assertTrue(Arrays.equals(bytes, expectedBytes));
+ }
+ {// Should pad the array correctly.
+ final BigEndianAscendingWordSerializer serializer =
+ new BigEndianAscendingWordSerializer(shortWordLength,
+ 1/*wordCount*/,
+ 1/*bytePadding*/);
+
+ serializer.writeWord(1);
+ // 1 byte leading padding | value 1 | trailing padding
+ // 0000 0000 | 0000 0000 0000 0000 1|000 0000
+ // 0x00 0x00 0x00 0x80
+ final byte[] bytes = serializer.getBytes();
+ final byte[] expectedBytes = new byte[] { 0, 0, 0, -128 };
+ assertTrue(Arrays.equals(bytes, expectedBytes));
+ }
+ }
+}
diff --git a/solr/core/src/test/org/apache/solr/util/hll/BitVectorTest.java b/solr/core/src/test/org/apache/solr/util/hll/BitVectorTest.java
new file mode 100644
index 0000000..bf6420b
--- /dev/null
+++ b/solr/core/src/test/org/apache/solr/util/hll/BitVectorTest.java
@@ -0,0 +1,169 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.util.hll;
+
+import java.util.Locale;
+
+import org.apache.lucene.util.LuceneTestCase;
+import org.junit.Test;
+
+/**
+ * Unit tests for {@link BitVector}.
+ */
+public class BitVectorTest extends LuceneTestCase {
+ /**
+ * Tests {@link BitVector#getRegister(long)} and {@link BitVector#setRegister(long, long)}.
+ */
+ @Test
+ public void getSetRegisterTest() {
+ { // locally scoped for sanity
+ // NOTE: registers are only 5bits wide
+ final BitVector vector1 = new BitVector(5/*width*/, 128/*count, 2^7*/);
+ final BitVector vector2 = new BitVector(5/*width*/, 128/*count, 2^7*/);
+ final BitVector vector3 = new BitVector(5/*width*/, 128/*count, 2^7*/);
+ final BitVector vector4 = new BitVector(5/*width*/, 128/*count, 2^7*/);
+
+ for(int i=0; i<128/*2^7*/; i++) {
+ vector1.setRegister(i, 0x1F);
+ vector2.setRegister(i, (i & 0x1F));
+ vector3.setRegister(i, ((127 - i) & 0x1F));
+ vector4.setRegister(i, 0x15);
+ }
+
+ for(int i=0; i<128/*2^7*/; i++) {
+ assertEquals(vector1.getRegister(i), 0x1F);
+ assertEquals(vector2.getRegister(i), (i & 0x1F));
+ assertEquals(vector3.getRegister(i), ((127 - i) & 0x1F));
+ assertEquals(vector4.getRegister(i), 0x15);
+ }
+ }
+ }
+
+ // ========================================================================
+ /**
+ * Tests {@link BitVector#registerIterator()}
+ */
+ @Test
+ public void registerIteratorTest() {
+ { // scoped locally for sanity
+ // NOTE: registers are only 5bits wide
+ final BitVector vector1 = new BitVector(5/*width*/, 128/*count, 2^7*/);
+ final BitVector vector2 = new BitVector(5/*width*/, 128/*count, 2^7*/);
+ final BitVector vector3 = new BitVector(5/*width*/, 128/*count, 2^7*/);
+ final BitVector vector4 = new BitVector(5/*width*/, 128/*count, 2^7*/);
+
+ for(int i=0; i<128/*2^7*/; i++) {
+ vector1.setRegister(i, 0x1F);
+ vector2.setRegister(i, (i & 0x1F));
+ vector3.setRegister(i, ((127 - i) & 0x1F));
+ vector4.setRegister(i, 0x15);
+ }
+
+ final LongIterator registerIterator1 = vector1.registerIterator();
+ final LongIterator registerIterator2 = vector2.registerIterator();
+ final LongIterator registerIterator3 = vector3.registerIterator();
+ final LongIterator registerIterator4 = vector4.registerIterator();
+ for(int i=0; i<128/*2^7*/; i++) {
+ assertEquals(registerIterator1.hasNext(), true);
+ assertEquals(registerIterator2.hasNext(), true);
+ assertEquals(registerIterator3.hasNext(), true);
+ assertEquals(registerIterator4.hasNext(), true);
+
+ assertEquals(registerIterator1.next(), 0x1F);
+ assertEquals(registerIterator2.next(), (i & 0x1F));
+ assertEquals(registerIterator3.next(), ((127 - i) & 0x1F));
+ assertEquals(registerIterator4.next(), 0x15);
+ }
+ assertEquals(registerIterator1.hasNext(), false/*no more*/);
+ assertEquals(registerIterator2.hasNext(), false/*no more*/);
+ assertEquals(registerIterator3.hasNext(), false/*no more*/);
+ assertEquals(registerIterator4.hasNext(), false/*no more*/);
+ }
+
+ { // scoped locally for sanity
+ // Vectors that are shorter than one word
+ assertIterator(1, 12/* 1*12=12 bits, fewer than a single word */);
+ assertIterator(2, 12/* 2*12=24 bits, fewer than a single word */);
+ assertIterator(3, 12/* 3*12=36 bits, fewer than a single word */);
+ assertIterator(4, 12/* 4*12=48 bits, fewer than a single word */);
+
+ // Vectors that don't fit exactly into longs
+ assertIterator(5, 16/* 5*16=80 bits */);
+ assertIterator(5, 32/* 5*32=160 bits */);
+ }
+
+ // Iterate over vectors that are padded
+ }
+
+ private static void assertIterator(final int width, final int count) {
+ final BitVector vector = new BitVector(width, count);
+ final LongIterator iter = vector.registerIterator();
+
+ for(int i=0; i<count; i++) {
+ assertTrue(String.format(Locale.ROOT, "expected more elements: width=%s, count=%s", width, count), iter.hasNext());
+ // TODO: fill with a sentinel value
+ assertEquals(iter.next(), 0);
+ }
+ assertFalse(String.format(Locale.ROOT, "expected no more elements: width=%s, count=%s", width, count), iter.hasNext());
+ }
+
+ // ========================================================================
+ /**
+ * Tests {@link BitVector#setMaxRegister(long, long)}
+ */
+ @Test
+ public void setMaxRegisterTest() {
+ final BitVector vector = new BitVector(5/*width*/, 128/*count, 2^7*/);
+
+ vector.setRegister(0, 10);
+ // should replace with a larger value
+ vector.setMaxRegister(0, 11);
+ assertEquals(vector.getRegister(0), 11);
+ // should not replace with a smaller or equal value
+ vector.setMaxRegister(0, 9);
+ assertEquals(vector.getRegister( 0), 11);
+ vector.setMaxRegister(0, 11);
+ assertEquals(vector.getRegister(0), 11);
+ }
+
+ // ========================================================================
+ // fill
+ /**
+ * Tests {@link BitVector#fill(long)}
+ */
+ @Test
+ public void fillTest() {
+ final BitVector vector = new BitVector(5/*width*/, 128/*count, 2^7*/);
+
+ for(int i=0; i<128/*2^7*/; i++) {
+ vector.setRegister(i, i);
+ }
+
+ vector.fill(0L);
+
+ for(int i=0; i<128/*2^7*/; i++) {
+ assertEquals(vector.getRegister(i), 0);
+ }
+
+ vector.fill(17L/*arbitrary*/);
+
+ for(int i=0; i<128/*2^7*/; i++) {
+ assertEquals(vector.getRegister(i), 17/*arbitrary*/);
+ }
+ }
+}
\ No newline at end of file
diff --git a/solr/core/src/test/org/apache/solr/util/hll/ExplicitHLLTest.java b/solr/core/src/test/org/apache/solr/util/hll/ExplicitHLLTest.java
new file mode 100644
index 0000000..1d7a85b
--- /dev/null
+++ b/solr/core/src/test/org/apache/solr/util/hll/ExplicitHLLTest.java
@@ -0,0 +1,235 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.util.hll;
+
+import java.util.HashSet;
+
+import org.apache.lucene.util.LuceneTestCase;
+import org.junit.Test;
+
+import com.carrotsearch.hppc.LongOpenHashSet;
+import static com.carrotsearch.randomizedtesting.RandomizedTest.*;
+
+
+/**
+ * Tests {@link HLL} of type {@link HLLType#EXPLICIT}.
+ */
+public class ExplicitHLLTest extends LuceneTestCase {
+ /**
+ * Tests basic set semantics of {@link HLL#addRaw(long)}.
+ */
+ @Test
+ public void addBasicTest() {
+ { // Adding a single positive value to an empty set should work.
+ final HLL hll = newHLL(128/*arbitrary*/);
+ hll.addRaw(1L/*positive*/);
+ assertEquals(hll.cardinality(), 1L);
+ }
+ { // Adding a single negative value to an empty set should work.
+ final HLL hll = newHLL(128/*arbitrary*/);
+ hll.addRaw(-1L/*negative*/);
+ assertEquals(hll.cardinality(), 1L);
+ }
+ { // Adding a duplicate value to a set should be a no-op.
+ final HLL hll = newHLL(128/*arbitrary*/);
+ hll.addRaw(1L/*positive*/);
+ assertEquals(hll.cardinality(), 1L/*arbitrary*/);
+ assertEquals(hll.cardinality(), 1L/*dupe*/);
+ }
+ }
+
+ // ------------------------------------------------------------------------
+ /**
+ * Tests {@link HLL#union(HLL)}.
+ */
+ @Test
+ public void unionTest() {
+ {// Unioning two distinct sets should work
+ final HLL hllA = newHLL(128/*arbitrary*/);
+ final HLL hllB = newHLL(128/*arbitrary*/);
+ hllA.addRaw(1L);
+ hllA.addRaw(2L);
+ hllB.addRaw(3L);
+
+ hllA.union(hllB);
+ assertEquals(hllA.cardinality(), 3);
+ }
+ {// Unioning two sets whose union doesn't exceed the cardinality cap should not promote
+ final HLL hllA = newHLL(128/*arbitrary*/);
+ final HLL hllB = newHLL(128/*arbitrary*/);
+ hllA.addRaw(1L);
+ hllA.addRaw(2L);
+ hllB.addRaw(1L);
+
+ hllA.union(hllB);
+ assertEquals(hllA.cardinality(), 2);
+ }
+ {// unioning two sets whose union exceeds the cardinality cap should promote
+ final HLL hllA = newHLL(128/*arbitrary*/);
+ final HLL hllB = newHLL(128/*arbitrary*/);
+
+ // fill up sets to explicitThreshold
+ for(long i=0; i<128/*explicitThreshold*/; i++) {
+ hllA.addRaw(i);
+ hllB.addRaw(i + 128);
+ }
+
+ hllA.union(hllB);
+ assertEquals(hllA.getType(), HLLType.SPARSE);
+ }
+ }
+
+ // ------------------------------------------------------------------------
+ /**
+ * Tests {@link HLL#clear()}
+ */
+ @Test
+ public void clearTest() {
+ final HLL hll = newHLL(128/*arbitrary*/);
+ hll.addRaw(1L);
+ assertEquals(hll.cardinality(), 1L);
+ hll.clear();
+ assertEquals(hll.cardinality(), 0L);
+ }
+
+ // ------------------------------------------------------------------------
+ /**
+ */
+ @Test
+ public void toFromBytesTest() {
+ final ISchemaVersion schemaVersion = SerializationUtil.DEFAULT_SCHEMA_VERSION;
+ final HLLType type = HLLType.EXPLICIT;
+ final int padding = schemaVersion.paddingBytes(type);
+ final int bytesPerWord = 8;
+
+ {// Should work on an empty set
+ final HLL hll = newHLL(128/*arbitrary*/);
+
+ final byte[] bytes = hll.toBytes(schemaVersion);
+
+ // assert output has correct byte length
+ assertEquals(bytes.length, padding/*no elements, just padding*/);
+
+ final HLL inHLL = HLL.fromBytes(bytes);
+
+ assertElementsEqual(hll, inHLL);
+ }
+ {// Should work on a partially filled set
+ final HLL hll = newHLL(128/*arbitrary*/);
+
+ for(int i=0; i<3; i++) {
+ hll.addRaw(i);
+ }
+
+ final byte[] bytes = hll.toBytes(schemaVersion);
+
+ // assert output has correct byte length
+ assertEquals(bytes.length, padding + (bytesPerWord * 3/*elements*/));
+
+ final HLL inHLL = HLL.fromBytes(bytes);
+
+ assertElementsEqual(hll, inHLL);
+ }
+ {// Should work on a full set
+ final int explicitThreshold = 128;
+ final HLL hll = newHLL(explicitThreshold);
+
+ for(int i=0; i<explicitThreshold; i++) {
+ hll.addRaw(27 + i/*arbitrary*/);
+ }
+
+ final byte[] bytes = hll.toBytes(schemaVersion);
+
+ // assert output has correct byte length
+ assertEquals(bytes.length, padding + (bytesPerWord * explicitThreshold/*elements*/));
+
+ final HLL inHLL = HLL.fromBytes(bytes);
+
+ assertElementsEqual(hll, inHLL);
+ }
+ }
+
+ // ------------------------------------------------------------------------
+ /**
+ * Tests correctness against {@link java.util.HashSet}.
+ */
+ @Test
+ public void randomValuesTest() {
+ final int explicitThreshold = 4096;
+ final HashSet<Long> canonical = new HashSet<Long>();
+ final HLL hll = newHLL(explicitThreshold);
+
+ for(int i=0;i<explicitThreshold;i++){
+ long randomLong = randomLong();
+ canonical.add(randomLong);
+ hll.addRaw(randomLong);
+ }
+ final int canonicalCardinality = canonical.size();
+ assertEquals(hll.cardinality(), canonicalCardinality);
+ }
+
+ // ------------------------------------------------------------------------
+ /**
+ * Tests promotion to {@link HLLType#SPARSE} and {@link HLLType#FULL}.
+ */
+ @Test
+ public void promotionTest() {
+ { // locally scoped for sanity
+ final int explicitThreshold = 128;
+ final HLL hll = new HLL(11/*log2m, unused*/, 5/*regwidth, unused*/, explicitThreshold, 256/*sparseThreshold*/, HLLType.EXPLICIT);
+
+ for(int i=0;i<explicitThreshold + 1;i++){
+ hll.addRaw(i);
+ }
+ assertEquals(hll.getType(), HLLType.SPARSE);
+ }
+ { // locally scoped for sanity
+ final HLL hll = new HLL(11/*log2m, unused*/, 5/*regwidth, unused*/, 4/*expthresh => explicitThreshold = 8*/, false/*sparseon*/, HLLType.EXPLICIT);
+
+ for(int i=0;i<9/* > explicitThreshold */;i++){
+ hll.addRaw(i);
+ }
+ assertEquals(hll.getType(), HLLType.FULL);
+ }
+ }
+
+ // ************************************************************************
+ // assertion helpers
+ /**
+ * Asserts that values in both sets are exactly equal.
+ */
+ private static void assertElementsEqual(final HLL hllA, final HLL hllB) {
+ final LongOpenHashSet internalSetA = hllA.explicitStorage;
+ final LongOpenHashSet internalSetB = hllB.explicitStorage;
+
+ assertTrue(internalSetA.equals(internalSetB));
+ }
+
+ /**
+ * Builds a {@link HLLType#EXPLICIT} {@link HLL} instance with the specified
+ * explicit threshold.
+ *
+ * @param explicitThreshold explicit threshold to use for the constructed
+ * {@link HLL}. This must be greater than zero.
+ * @return a default-sized {@link HLLType#EXPLICIT} empty {@link HLL} instance.
+ * This will never be <code>null</code>.
+ */
+ private static HLL newHLL(final int explicitThreshold) {
+ return new HLL(11/*log2m, unused*/, 5/*regwidth, unused*/, explicitThreshold, 256/*sparseThreshold, arbitrary, unused*/, HLLType.EXPLICIT);
+ }
+}
\ No newline at end of file
diff --git a/solr/core/src/test/org/apache/solr/util/hll/FullHLLTest.java b/solr/core/src/test/org/apache/solr/util/hll/FullHLLTest.java
new file mode 100644
index 0000000..dd5f21f
--- /dev/null
+++ b/solr/core/src/test/org/apache/solr/util/hll/FullHLLTest.java
@@ -0,0 +1,341 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.util.hll;
+
+import org.apache.lucene.util.LuceneTestCase;
+import org.junit.Test;
+
+/**
+ * Tests {@link HLL} of type {@link HLLType#FULL}.
+ */
+public class FullHLLTest extends LuceneTestCase {
+ // TODO union test
+ /**
+ * Smoke test for {@link HLL#cardinality()} and the proper use of the
+ * small range correction.
+ */
+ @Test
+ public void smallRangeSmokeTest() {
+ final int log2m = 11;
+ final int m = (1 << log2m);
+ final int regwidth = 5;
+
+ // only one register set
+ {
+ final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary, unused*/, HLLType.FULL);
+ hll.addRaw(ProbabilisticTestUtil.constructHLLValue(log2m, 0/*ix*/, 1/*val*/));
+
+ final long cardinality = hll.cardinality();
+
+ // Trivially true that small correction conditions hold: one register
+ // set implies zeroes exist, and estimator trivially smaller than 5m/2.
+ // Small range correction: m * log(m/V)
+ final long expected = (long)Math.ceil(m * Math.log((double)m / (m - 1)/*# of zeroes*/));
+ assertEquals(cardinality, expected);
+ }
+
+ // all but one register set
+ {
+ final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary, unused*/, HLLType.FULL);
+ for(int i=0; i<(m - 1); i++) {
+ hll.addRaw(ProbabilisticTestUtil.constructHLLValue(log2m, i/*ix*/, 1/*val*/));
+ }
+
+ // Trivially true that small correction conditions hold: all but
+ // one register set implies a zero exists, and estimator trivially
+ // smaller than 5m/2 since it's alpha / ((m-1)/2)
+ final long cardinality = hll.cardinality();
+
+ // Small range correction: m * log(m/V)
+ final long expected = (long)Math.ceil(m * Math.log((double)m / 1/*# of zeroes*/));
+ assertEquals(cardinality, expected);
+ }
+ }
+
+ /**
+ * Smoke test for {@link HLL#cardinality()} and the proper use of the
+ * uncorrected estimator
+ */
+ @Test
+ public void normalRangeSmokeTest() {
+ final int log2m = 11;
+ final int regwidth = 5;
+ // regwidth = 5, so hash space is
+ // log2m + (2^5 - 1 - 1), so L = log2m + 30
+ final int l = log2m + 30;
+ final int m = (1 << log2m);
+ final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary, unused*/, HLLType.FULL);
+
+ // all registers at 'medium' value
+ {
+ final int registerValue = 7/*chosen to ensure neither correction kicks in*/;
+ for(int i=0; i<m; i++) {
+ hll.addRaw(ProbabilisticTestUtil.constructHLLValue(log2m, i, registerValue));
+ }
+
+ final long cardinality = hll.cardinality();
+
+
+ // Simplified estimator when all registers take same value: alpha / (m/2^val)
+ final double estimator = HLLUtil.alphaMSquared(m)/((double)m/Math.pow(2, registerValue));
+
+ // Assert conditions for uncorrected range
+ assertTrue(estimator <= Math.pow(2, l)/30);
+ assertTrue(estimator > (5 * m /(double)2));
+
+ final long expected = (long)Math.ceil(estimator);
+ assertEquals(cardinality, expected);
+ }
+ }
+
+ /**
+ * Smoke test for {@link HLL#cardinality()} and the proper use of the large
+ * range correction.
+ */
+ @Test
+ public void largeRangeSmokeTest() {
+ final int log2m = 12;
+ final int regwidth = 5;
+ // regwidth = 5, so hash space is
+ // log2m + (2^5 - 1 - 1), so L = log2m + 30
+ final int l = log2m + 30;
+ final int m = (1 << log2m);
+ final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary, unused*/, HLLType.FULL);
+
+ {
+ final int registerValue = 31/*chosen to ensure large correction kicks in*/;
+ for(int i=0; i<m; i++) {
+ hll.addRaw(ProbabilisticTestUtil.constructHLLValue(log2m, i, registerValue));
+ }
+
+ final long cardinality = hll.cardinality();
+
+
+ // Simplified estimator when all registers take same value: alpha / (m/2^val)
+ final double estimator = HLLUtil.alphaMSquared(m)/((double)m/Math.pow(2, registerValue));
+
+ // Assert conditions for large range
+
+ assertTrue(estimator > Math.pow(2,l)/30);
+
+ // Large range correction: -2^L * log(1 - E/2^L)
+ final long expected = (long)Math.ceil(-1.0 * Math.pow(2, l) * Math.log(1.0 - estimator/Math.pow(2, l)));
+ assertEquals(cardinality, expected);
+ }
+ }
+
+ // ========================================================================
+ /**
+ * Tests the bounds on a register's value for a given raw input value.
+ */
+ @Test
+ public void registerValueTest() {
+ final int log2m = 4/*small enough to make testing easy (addRaw() shifts by one byte)*/;
+
+ // register width 4 (the minimum size)
+ { // scoped locally for sanity
+ final int regwidth = 4;
+ final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary, unused*/, HLLType.FULL);
+ final BitVector bitVector = hll.probabilisticStorage;
+
+ // lower-bounds of the register
+ hll.addRaw(0x000000000000001L/*'j'=1*/);
+ assertEquals(bitVector.getRegister(1/*'j'*/), 0);
+
+ hll.addRaw(0x0000000000000012L/*'j'=2*/);
+ assertEquals(bitVector.getRegister(2/*'j'*/), 1);
+
+ hll.addRaw(0x0000000000000023L/*'j'=3*/);
+ assertEquals(bitVector.getRegister(3/*'j'*/), 2);
+
+ hll.addRaw(0x0000000000000044L/*'j'=4*/);
+ assertEquals(bitVector.getRegister(4/*'j'*/), 3);
+
+ hll.addRaw(0x0000000000000085L/*'j'=5*/);
+ assertEquals(bitVector.getRegister(5/*'j'*/), 4);
+
+ // upper-bounds of the register
+ // NOTE: bear in mind that BitVector itself does ensure that
+ // overflow of a register is prevented
+ hll.addRaw(0x0000000000010006L/*'j'=6*/);
+ assertEquals(bitVector.getRegister(6/*'j'*/), 13);
+
+ hll.addRaw(0x0000000000020007L/*'j'=7*/);
+ assertEquals(bitVector.getRegister(7/*'j'*/), 14);
+
+ hll.addRaw(0x0000000000040008L/*'j'=8*/);
+ assertEquals(bitVector.getRegister(8/*'j'*/), 15);
+
+ hll.addRaw(0x0000000000080009L/*'j'=9*/);
+ assertEquals(bitVector.getRegister(9/*'j'*/), 15/*overflow*/);
+
+ // sanity checks to ensure that no other bits above the lowest-set
+ // bit matters
+ // NOTE: same as case 'j = 6' above
+ hll.addRaw(0x000000000003000AL/*'j'=10*/);
+ assertEquals(bitVector.getRegister(10/*'j'*/), 13);
+
+ hll.addRaw(0x000000000011000BL/*'j'=11*/);
+ assertEquals(bitVector.getRegister(11/*'j'*/), 13);
+ }
+
+ // register width 5
+ { // scoped locally for sanity
+ final int regwidth = 5;
+ final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary, unused*/, HLLType.FULL);
+ final BitVector bitVector = hll.probabilisticStorage;
+
+ // lower-bounds of the register
+ hll.addRaw(0x0000000000000001L/*'j'=1*/);
+ assertEquals(bitVector.getRegister(1/*'j'*/), 0);
+
+ hll.addRaw(0x0000000000000012L/*'j'=2*/);
+ assertEquals(bitVector.getRegister(2/*'j'*/), 1);
+
+ hll.addRaw(0x0000000000000023L/*'j'=3*/);
+ assertEquals(bitVector.getRegister(3/*'j'*/), 2);
+
+ hll.addRaw(0x0000000000000044L/*'j'=4*/);
+ assertEquals(bitVector.getRegister(4/*'j'*/), 3);
+
+ hll.addRaw(0x0000000000000085L/*'j'=5*/);
+ assertEquals(bitVector.getRegister(5/*'j'*/), 4);
+
+ // upper-bounds of the register
+ // NOTE: bear in mind that BitVector itself does ensure that
+ // overflow of a register is prevented
+ hll.addRaw(0x0000000100000006L/*'j'=6*/);
+ assertEquals(bitVector.getRegister(6/*'j'*/), 29);
+
+ hll.addRaw(0x0000000200000007L/*'j'=7*/);
+ assertEquals(bitVector.getRegister(7/*'j'*/), 30);
+
+ hll.addRaw(0x0000000400000008L/*'j'=8*/);
+ assertEquals(bitVector.getRegister(8/*'j'*/), 31);
+
+ hll.addRaw(0x0000000800000009L/*'j'=9*/);
+ assertEquals(bitVector.getRegister(9/*'j'*/), 31/*overflow*/);
+ }
+ }
+
+ // ========================================================================
+ /**
+ * Tests {@link HLL#clear()}.
+ */
+ @Test
+ public void clearTest() {
+ final int regwidth = 5;
+ final int log2m = 4/*16 registers per counter*/;
+ final int m = 1 << log2m;
+
+ final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary, unused*/, HLLType.FULL);
+ final BitVector bitVector = hll.probabilisticStorage;
+ for(int i=0; i<m; i++)
+ bitVector.setRegister(i, i);
+
+ hll.clear();
+ for(int i=0; i<m; i++){
+ assertEquals(bitVector.getRegister(i), 0L/*default value of register*/);
+ }
+ }
+
+ // ========================================================================
+ // Serialization
+ /**
+ * Tests {@link HLL#toBytes(ISchemaVersion)} and {@link HLL#fromBytes(byte[])}.
+ */
+ @Test
+ public void toFromBytesTest() {
+ final int log2m = 11/*arbitrary*/;
+ final int regwidth = 5;
+
+ final ISchemaVersion schemaVersion = SerializationUtil.DEFAULT_SCHEMA_VERSION;
+ final HLLType type = HLLType.FULL;
+ final int padding = schemaVersion.paddingBytes(type);
+ final int dataByteCount = ProbabilisticTestUtil.getRequiredBytes(regwidth, (1 << log2m)/*aka 2^log2m = m*/);
+ final int expectedByteCount = padding + dataByteCount;
+
+ {// Should work on an empty element
+ final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary, unused*/, HLLType.FULL);
+ final byte[] bytes = hll.toBytes(schemaVersion);
+
+ // assert output length is correct
+ assertEquals(bytes.length, expectedByteCount);
+
+ final HLL inHLL = HLL.fromBytes(bytes);
+
+ // assert register values correct
+ assertElementsEqual(hll, inHLL);
+ }
+ {// Should work on a partially filled element
+ final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary, unused*/, HLLType.FULL);
+
+ for(int i=0; i<3; i++) {
+ final long rawValue = ProbabilisticTestUtil.constructHLLValue(log2m, i, (i+9));
+ hll.addRaw(rawValue);
+ }
+
+ final byte[] bytes = hll.toBytes(schemaVersion);
+
+ // assert output length is correct
+ assertEquals(bytes.length, expectedByteCount);
+
+ final HLL inHLL = HLL.fromBytes(bytes);
+
+ // assert register values correct
+ assertElementsEqual(hll, inHLL);
+ }
+ {// Should work on a full set
+ final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary, unused*/, HLLType.FULL);
+
+ for(int i=0; i<(1 << log2m)/*aka 2^log2m*/; i++) {
+ final long rawValue = ProbabilisticTestUtil.constructHLLValue(log2m, i, (i % 9) + 1);
+ hll.addRaw(rawValue);
+ }
+
+ final byte[] bytes = hll.toBytes(schemaVersion);
+
+ // assert output length is correct
+ assertEquals(bytes.length, expectedByteCount);
+
+ final HLL inHLL = HLL.fromBytes(bytes);
+
+ // assert register values correct
+ assertElementsEqual(hll, inHLL);
+ }
+ }
+
+ // ************************************************************************
+ // Assertion Helpers
+ /**
+ * Asserts that the two HLLs are register-wise equal.
+ */
+ private static void assertElementsEqual(final HLL hllA, final HLL hllB) {
+ final BitVector bitVectorA = hllA.probabilisticStorage;
+ final BitVector bitVectorB = hllB.probabilisticStorage;
+
+ final LongIterator iterA = bitVectorA.registerIterator();
+ final LongIterator iterB = bitVectorB.registerIterator();
+
+ for(;iterA.hasNext() && iterB.hasNext();) {
+ assertEquals(iterA.next(), iterB.next());
+ }
+ assertFalse(iterA.hasNext());
+ assertFalse(iterB.hasNext());
+ }
+}
\ No newline at end of file
diff --git a/solr/core/src/test/org/apache/solr/util/hll/HLLSerializationTest.java b/solr/core/src/test/org/apache/solr/util/hll/HLLSerializationTest.java
new file mode 100644
index 0000000..1717ac3
--- /dev/null
+++ b/solr/core/src/test/org/apache/solr/util/hll/HLLSerializationTest.java
@@ -0,0 +1,88 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.util.hll;
+
+import org.apache.lucene.util.LuceneTestCase;
+import org.junit.Test;
+
+import static com.carrotsearch.randomizedtesting.RandomizedTest.*;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.List;
+import java.util.Random;
+
+import static org.apache.solr.util.hll.HLL.*;
+
+/**
+ * Serialization smoke-tests.
+ */
+public class HLLSerializationTest extends LuceneTestCase {
+ /**
+ * A smoke-test that covers serialization/deserialization of an HLL
+ * under all possible parameters.
+ */
+ @Test
+ @Slow
+ @Nightly
+ public void serializationSmokeTest() throws Exception {
+ final Random random = new Random(randomLong());
+ final int randomCount = 250;
+ final List<Long> randoms = new ArrayList<Long>(randomCount);
+ for (int i=0; i<randomCount; i++) {
+ randoms.add(random.nextLong());
+ }
+
+ assertCardinality(HLLType.EMPTY, randoms);
+ assertCardinality(HLLType.EXPLICIT, randoms);
+ assertCardinality(HLLType.SPARSE, randoms);
+ assertCardinality(HLLType.FULL, randoms);
+ }
+
+ // NOTE: log2m<=16 was chosen as the max log2m parameter so that the test
+ // completes in a reasonable amount of time. Not much is gained by
+ // testing larger values - there are no more known serialization
+ // related edge cases that appear as log2m gets even larger.
+ // NOTE: This test completed successfully with log2m<=MAXIMUM_LOG2M_PARAM
+ // on 2014-01-30.
+ private static void assertCardinality(final HLLType hllType, final Collection<Long> items)
+ throws CloneNotSupportedException {
+ for(int log2m=MINIMUM_LOG2M_PARAM; log2m<=16; log2m++) {
+ for(int regw=MINIMUM_REGWIDTH_PARAM; regw<=MAXIMUM_REGWIDTH_PARAM; regw++) {
+ for(int expthr=MINIMUM_EXPTHRESH_PARAM; expthr<=MAXIMUM_EXPTHRESH_PARAM; expthr++ ) {
+ for(final boolean sparse: new boolean[]{true, false}) {
+ HLL hll = new HLL(log2m, regw, expthr, sparse, hllType);
+ for(final Long item: items) {
+ hll.addRaw(item);
+ }
+ HLL copy = HLL.fromBytes(hll.toBytes());
+ assertEquals(copy.cardinality(), hll.cardinality());
+ assertEquals(copy.getType(), hll.getType());
+ assertTrue(Arrays.equals(copy.toBytes(), hll.toBytes()));
+
+ HLL clone = hll.clone();
+ assertEquals(clone.cardinality(), hll.cardinality());
+ assertEquals(clone.getType(), hll.getType());
+ assertTrue(Arrays.equals(clone.toBytes(), hll.toBytes()));
+ }
+ }
+ }
+ }
+ }
+}
diff --git a/solr/core/src/test/org/apache/solr/util/hll/HLLUtilTest.java b/solr/core/src/test/org/apache/solr/util/hll/HLLUtilTest.java
new file mode 100644
index 0000000..583384d
--- /dev/null
+++ b/solr/core/src/test/org/apache/solr/util/hll/HLLUtilTest.java
@@ -0,0 +1,45 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.util.hll;
+
+import org.apache.lucene.util.LuceneTestCase;
+import org.junit.Test;
+
+/**
+ * Tests {@link HLLUtil} static methods.
+ */
+public class HLLUtilTest extends LuceneTestCase {
+ /**
+ * Tests that {@link HLLUtil#largeEstimatorCutoff(int, int)} is the same
+ * as a trivial implementation.
+ */
+ @Test
+ public void largeEstimatorCutoffTest() {
+ for(int log2m=HLL.MINIMUM_LOG2M_PARAM; log2m<=HLL.MAXIMUM_LOG2M_PARAM; log2m++) {
+ for(int regWidth=HLL.MINIMUM_REGWIDTH_PARAM; regWidth<=HLL.MINIMUM_REGWIDTH_PARAM; regWidth++) {
+ final double cutoff = HLLUtil.largeEstimatorCutoff(log2m, regWidth);
+
+ // See blog post (http://research.neustar.biz/2013/01/24/hyperloglog-googles-take-on-engineering-hll/)
+ // and original paper (Fig. 3) for information on 2^L and
+ // "large range correction" cutoff.
+ final double expected = Math.pow(2, Math.pow(2, regWidth) - 2 + log2m) / 30.0;
+ assertEquals(cutoff, expected, 0.0001);
+ }
+ }
+ }
+}
diff --git a/solr/core/src/test/org/apache/solr/util/hll/IntegrationTestGenerator.java b/solr/core/src/test/org/apache/solr/util/hll/IntegrationTestGenerator.java
new file mode 100644
index 0000000..edc2c9c
--- /dev/null
+++ b/solr/core/src/test/org/apache/solr/util/hll/IntegrationTestGenerator.java
@@ -0,0 +1,712 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.util.hll;
+
+import static com.carrotsearch.randomizedtesting.RandomizedTest.*;
+import static org.apache.solr.util.hll.ProbabilisticTestUtil.*;
+
+import java.io.IOException;
+import java.io.Writer;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Paths;
+import java.util.Random;
+
+/**
+ * Generates test files for testing other implementations of HLL
+ * serialization/deserialization, namely the PostgreSQL implementation.
+ */
+public class IntegrationTestGenerator {
+ // ************************************************************************
+ // directory to output the generated tests
+ private static final String OUTPUT_DIRECTORY = "/tmp/hll_test/";
+
+ // ------------------------------------------------------------------------
+ // configurations for HLLs, should mirror settings in PostgreSQL impl. tests
+ private static final int REGWIDTH = 5;
+ private static final int LOG2M = 11;
+ // NOTE: This differs from the PostgreSQL impl. parameter 'expthresh'. This
+ // is a literal threshold to use in the promotion hierarchy, implying
+ // that both EXPLICIT representation should be used and it should
+ // NOT be automatically computed. This is done to ensure that the
+ // parameters of the test are very explicitly defined.
+ private static final int EXPLICIT_THRESHOLD = 256;
+ // NOTE: This is not the PostgreSQL impl. parameter 'sparseon'. 'sparseon'
+ // is assumed to be true and this is a literal register-count threshold
+ // to use in the promotion hierarchy. This is done to ensure that the
+ // parameters of the test are very explicitly defined.
+ private static final int SPARSE_THRESHOLD = 850;
+
+ // ------------------------------------------------------------------------
+ // computed constants
+ private static final int REGISTER_COUNT = (1 << LOG2M);
+ private static final int REGISTER_MAX_VALUE = (1 << REGWIDTH) - 1;
+
+ // ========================================================================
+ // Tests
+ /**
+ * Cumulatively adds random values to a FULL HLL through the small range
+ * correction, uncorrected range, and large range correction of the HLL's
+ * cardinality estimator.
+ *
+ * Format: cumulative add
+ * Tests:
+ * - FULL cardinality computation
+ */
+ private static void fullCardinalityCorrectionTest(final ISchemaVersion schemaVersion) throws IOException {
+ final Writer output = openOutput(schemaVersion, "cardinality_correction", TestType.ADD);
+
+ // the accumulator, starts empty
+ final HLL hll = newHLL(HLLType.FULL);
+ initLineAdd(output, hll, schemaVersion);
+
+ // run through some values in the small range correction
+ for(int i=0; i<((1 << LOG2M) - 1); i++) {
+ final long rawValue = constructHLLValue(LOG2M, i, 1);
+ cumulativeAddLine(output, hll, rawValue, schemaVersion);
+ }
+
+ // run up past some values in the uncorrected range
+ for(int i=0; i<(1 << LOG2M); i++) {
+ final long rawValue = constructHLLValue(LOG2M, i, 7);
+ cumulativeAddLine(output, hll, rawValue, schemaVersion);
+ }
+
+ // run through some values in the large range correction
+ for(int i=0; i<(1 << LOG2M); i++) {
+ final long rawValue = constructHLLValue(LOG2M, i, 30);
+ cumulativeAddLine(output, hll, rawValue, schemaVersion);
+ }
+
+ output.flush();
+ output.close();
+ }
+
+ /**
+ * Cumulatively adds random values to an EMPTY HLL.
+ *
+ * Format: cumulative add
+ * Tests:
+ * - EMPTY, EXPLICIT, SPARSE, PROBABILSTIC addition
+ * - EMPTY to EXPLICIT promotion
+ * - EXPLICIT to SPARSE promotion
+ * - SPARSE to FULL promotion
+ */
+ private static void globalStepTest(final ISchemaVersion schemaVersion) throws IOException {
+ final Writer output = openOutput(schemaVersion, "comprehensive_promotion", TestType.ADD);
+
+ // the accumulator, starts empty
+ final HLL hll = newHLL(HLLType.EMPTY);
+ initLineAdd(output, hll, schemaVersion);
+
+ for(int i=0; i<10000/*arbitrary*/; i++) {
+ cumulativeAddLine(output, hll, randomLong(), schemaVersion);
+ }
+
+ output.flush();
+ output.close();
+ }
+
+ /**
+ * Cumulatively unions "underpopulated" FULL HLLs into the
+ * accumulator to verify the correct behavior from the PostgreSQL implementation.
+ * The PostgreSQL implementation's representations of probabilistic HLLs should
+ * depend exclusively on the chosen SPARSE-to-FULL cutoff.
+ *
+ * Format: cumulative union
+ * Tests:
+ * - EMPTY U "underpopulated" FULL => SPARSE
+ * - SPARSE U "underpopulated" FULL => SPARSE
+ * - SPARSE U "barely underpopulated" FULL => FULL
+ */
+ private static void sparseFullRepresentationTest(final ISchemaVersion schemaVersion) throws IOException {
+ final Writer output = openOutput(schemaVersion, "sparse_full_representation", TestType.UNION);
+
+ final HLL emptyHLL1 = newHLL(HLLType.EMPTY);
+ final HLL emptyHLL2 = newHLL(HLLType.EMPTY);
+
+ cumulativeUnionLine(output, emptyHLL1, emptyHLL2, schemaVersion);
+
+ // NOTE: In this test the sparseReference will be the "expected" value
+ // from the C representation, since it doesn't choose representation
+ // based on original encoding, but rather on the promotion rules
+ // and the declared type of the "receiving" field.
+ // It is the manually-constructed union result.
+
+ // "underpopulated" FULL U EMPTY => SPARSE
+ final HLL fullHLL = newHLL(HLLType.FULL);
+ fullHLL.addRaw(constructHLLValue(LOG2M, 0/*ix*/, 1/*val*/));
+
+ final HLL sparseHLL = newHLL(HLLType.SPARSE);
+ sparseHLL.addRaw(constructHLLValue(LOG2M, 0/*ix*/, 1/*val*/));
+
+ output.write(stringCardinality(fullHLL) + "," + toByteA(fullHLL, schemaVersion) + "," + stringCardinality(sparseHLL) + "," + toByteA(sparseHLL, schemaVersion) + "\n");
+ output.flush();
+
+ // "underpopulated" FULL (small) U SPARSE (small) => SPARSE
+ final HLL fullHLL2 = newHLL(HLLType.FULL);
+ fullHLL2.addRaw(constructHLLValue(LOG2M, 1/*ix*/, 1/*val*/));
+
+ sparseHLL.addRaw(constructHLLValue(LOG2M, 1/*ix*/, 1/*val*/));
+
+ output.write(stringCardinality(fullHLL2) + "," + toByteA(fullHLL2, schemaVersion) + "," + stringCardinality(sparseHLL) + "," + toByteA(sparseHLL, schemaVersion) + "\n");
+ output.flush();
+
+ // "underpopulated" FULL (just on edge) U SPARSE (small) => FULL
+ final HLL fullHLL3 = newHLL(HLLType.FULL);
+ for(int i=2; i<(SPARSE_THRESHOLD + 1); i++) {
+ fullHLL3.addRaw(constructHLLValue(LOG2M, i/*ix*/, 1/*val*/));
+ sparseHLL.addRaw(constructHLLValue(LOG2M, i/*ix*/, 1/*val*/));
+ }
+
+ output.write(stringCardinality(fullHLL3) + "," + toByteA(fullHLL3, schemaVersion) + "," + stringCardinality(sparseHLL) + "," + toByteA(sparseHLL, schemaVersion) + "\n");
+ output.flush();
+ }
+
+ /**
+ * Cumulatively sets successive registers to:
+ *
+ * <code>(registerIndex % REGISTER_MAX_VALUE) + 1</code>
+ *
+ * by adding specifically constructed values to a SPARSE HLL.
+ * Does not induce promotion.
+ *
+ * Format: cumulative add
+ * Tests:
+ * - SPARSE addition (predictable)
+ */
+ private static void sparseStepTest(final ISchemaVersion schemaVersion) throws IOException {
+ final Writer output = openOutput(schemaVersion, "sparse_step", TestType.ADD);
+
+ // the accumulator, starts empty sparse probabilistic
+ final HLL hll = newHLL(HLLType.SPARSE);
+ initLineAdd(output, hll, schemaVersion);
+
+ for(int i=0; i<SPARSE_THRESHOLD; i++) {
+ final long rawValue = constructHLLValue(LOG2M, i, ((i % REGISTER_MAX_VALUE) + 1));
+ cumulativeAddLine(output, hll, rawValue, schemaVersion);
+ }
+
+ output.flush();
+ output.close();
+ }
+
+ /**
+ * Cumulatively sets random registers of a SPARSE HLL to
+ * random values by adding random values. Does not induce promotion.
+ *
+ * Format: cumulative add
+ * Tests:
+ * - SPARSE addition (random)
+ */
+ private static void sparseRandomTest(final ISchemaVersion schemaVersion) throws IOException {
+ final Writer output = openOutput(schemaVersion, "sparse_random", TestType.ADD);
+
+ final Random random = new Random(randomLong());
+
+ // the accumulator, starts empty
+ final HLL hll = newHLL(HLLType.SPARSE);
+ initLineAdd(output, hll, schemaVersion);
+
+ for(int i=0; i<SPARSE_THRESHOLD; i++) {
+ final int registerIndex = Math.abs(random.nextInt()) % REGISTER_COUNT;
+ final int registerValue = ((Math.abs(random.nextInt()) % REGISTER_MAX_VALUE) + 1);
+ final long rawValue = constructHLLValue(LOG2M, registerIndex, registerValue);
+
+ cumulativeAddLine(output, hll, rawValue, schemaVersion);
+ }
+
+ output.flush();
+ output.close();
+ }
+
+ /**
+ * Cumulatively sets the first register (index 0) to value 2, the last
+ * register (index m-1) to value 2, and then sets registers with indices in
+ * the range 2 to (sparseCutoff + 2) to value 1 to trigger promotion.
+ *
+ * This tests for register alignment in the promotion from SPARSE
+ * to FULL.
+ *
+ * Format: cumulative add
+ * Tests:
+ * - SPARSE addition
+ * - SPARSE to FULL promotion
+ */
+ private static void sparseEdgeTest(final ISchemaVersion schemaVersion) throws IOException {
+ final Writer output = openOutput(schemaVersion, "sparse_edge", TestType.ADD);
+
+ // the accumulator, starts empty
+ final HLL hll = newHLL(HLLType.SPARSE);
+ initLineAdd(output, hll, schemaVersion);
+
+ final long firstValue = constructHLLValue(LOG2M, 0, 2);
+ cumulativeAddLine(output, hll, firstValue, schemaVersion);
+
+ final long lastValue = constructHLLValue(LOG2M, (1 << LOG2M) - 1, 2);
+ cumulativeAddLine(output, hll, lastValue, schemaVersion);
+
+ for(int i=2; i<(SPARSE_THRESHOLD + 2); i++) {
+ final long middleValue = constructHLLValue(LOG2M, i, 1);
+
+ cumulativeAddLine(output, hll, middleValue, schemaVersion);
+ }
+
+ output.flush();
+ output.close();
+ }
+
+ /**
+ * Unions an EMPTY accumulator with EXPLICIT HLLs, each containing a
+ * single random value.
+ *
+ * Format: cumulative union
+ * Tests:
+ * - EMPTY U EXPLICIT
+ * - EXPLICIT U EXPLICIT
+ * - EXPLICIT to SPARSE promotion
+ * - SPARSE U EXPLICIT
+ */
+ private static void explicitPromotionTest(final ISchemaVersion schemaVersion) throws IOException {
+ final Writer output = openOutput(schemaVersion, "explicit_promotion", TestType.UNION);
+
+ final Random random = new Random(randomLong());
+
+ // the accumulator, starts empty
+ final HLL hll = newHLL(HLLType.EMPTY);
+ final HLL emptyHLL = newHLL(HLLType.EMPTY);
+ cumulativeUnionLine(output, hll, emptyHLL, schemaVersion);
+
+ for(int i=0; i<(EXPLICIT_THRESHOLD+500)/*should be greater than promotion cutoff*/; i++) {
+ // make an EXPLICIT set and populate with cardinality 1
+ final HLL explicitHLL = newHLL(HLLType.EXPLICIT);
+ explicitHLL.addRaw(random.nextLong());
+
+ cumulativeUnionLine(output, hll, explicitHLL, schemaVersion);
+ }
+
+ output.flush();
+ output.close();
+ }
+
+ /**
+ * Unions an EMPTY accumulator with SPARSE HLLs, each
+ * having one register set.
+ *
+ * Format: cumulative union
+ * Tests:
+ * - EMPTY U SPARSE
+ * - SPARSE U SPARSE
+ * - SPARSE promotion
+ * - SPARSE U FULL
+ */
+ private static void sparseProbabilisticPromotionTest(final ISchemaVersion schemaVersion) throws IOException {
+ final Writer output = openOutput(schemaVersion, "sparse_promotion", TestType.UNION);
+
+ final Random random = new Random(randomLong());
+
+ // the accumulator, starts empty
+ final HLL hll = newHLL(HLLType.EMPTY);
+ final HLL emptyHLL = newHLL(HLLType.EMPTY);
+ cumulativeUnionLine(output, hll, emptyHLL, schemaVersion);
+
+
+ for(int i=0; i<(SPARSE_THRESHOLD + 1000)/*should be greater than promotion cutoff*/; i++) {
+ // make a SPARSE set and populate with cardinality 1
+ final HLL sparseHLL = newHLL(HLLType.SPARSE);
+
+ final int registerIndex = Math.abs(random.nextInt()) % REGISTER_COUNT;
+ final int registerValue = ((Math.abs(random.nextInt()) % REGISTER_MAX_VALUE) + 1);
+ final long rawValue = constructHLLValue(LOG2M, registerIndex, registerValue);
+ sparseHLL.addRaw(rawValue);
+
+ cumulativeUnionLine(output, hll, sparseHLL, schemaVersion);
+ }
+
+ output.flush();
+ output.close();
+ }
+
+ /**
+ * Unions an EMPTY accumulator with EXPLICIT HLLs, each having a single
+ * random value, twice in a row to verify that the set properties are
+ * satisfied.
+ *
+ * Format: cumulative union
+ * Tests:
+ * - EMPTY U EXPLICIT
+ * - EXPLICIT U EXPLICIT
+ */
+ private static void explicitOverlapTest(final ISchemaVersion schemaVersion) throws IOException {
+ final Writer output = openOutput(schemaVersion, "explicit_explicit", TestType.UNION);
+
+ final Random random = new Random(randomLong());
+
+ // the accumulator, starts empty
+ final HLL hll = newHLL(HLLType.EMPTY);
+ final HLL emptyHLL = newHLL(HLLType.EMPTY);
+
+ cumulativeUnionLine(output, hll, emptyHLL, schemaVersion);
+
+ for(int i=0; i<EXPLICIT_THRESHOLD; i++) {
+ // make an EXPLICIT set and populate with cardinality 1
+ final HLL explicitHLL = newHLL(HLLType.EXPLICIT);
+ explicitHLL.addRaw(random.nextLong());
+
+ // union it into the accumulator twice, to test overlap (cardinality should not change)
+ cumulativeUnionLine(output, hll, explicitHLL, schemaVersion);
+ cumulativeUnionLine(output, hll, explicitHLL, schemaVersion);
+ }
+
+ output.flush();
+ output.close();
+ }
+
+ /**
+ * Unions an EMPTY accumulator with SPARSE HLLs, each
+ * having a single register set, twice in a row to verify that the set
+ * properties are satisfied.
+ *
+ * Format: cumulative union
+ * Tests:
+ * - EMPTY U SPARSE
+ * - SPARSE U SPARSE
+ */
+ private static void sparseProbabilisticOverlapTest(final ISchemaVersion schemaVersion) throws IOException {
+ final Writer output = openOutput(schemaVersion, "sparse_sparse", TestType.UNION);
+
+ final Random random = new Random(randomLong());
+
+ // the accumulator, starts empty
+ final HLL hll = newHLL(HLLType.EMPTY);
+ final HLL emptyHLL = newHLL(HLLType.EMPTY);
+
+ cumulativeUnionLine(output, hll, emptyHLL, schemaVersion);
+
+ for(int i=0; i<SPARSE_THRESHOLD; i++) {
+ // make a SPARSE set and populate with cardinality 1
+ final HLL sparseHLL = newHLL(HLLType.SPARSE);
+ final int registerIndex = Math.abs(random.nextInt()) % REGISTER_COUNT;
+ final int registerValue = ((Math.abs(random.nextInt()) % REGISTER_MAX_VALUE) + 1);
+ final long rawValue = constructHLLValue(LOG2M, registerIndex, registerValue);
+ sparseHLL.addRaw(rawValue);
+
+ cumulativeUnionLine(output, hll, sparseHLL, schemaVersion);
+ }
+
+ output.flush();
+ output.close();
+ }
+
+ /**
+ * Unions an EMPTY accumulator with FULL HLLs, each having
+ * many registers set, twice in a row to verify that the set properties are
+ * satisfied.
+ *
+ * Format: cumulative union
+ * Tests:
+ * - EMPTY U FULL
+ * - FULL U FULL
+ */
+ private static void probabilisticUnionTest(final ISchemaVersion schemaVersion) throws IOException {
+ final Writer output = openOutput(schemaVersion, "probabilistic_probabilistic", TestType.UNION);
+
+ final Random random = new Random(randomLong());
+
+ // the accumulator, starts empty
+ final HLL hll = newHLL(HLLType.EMPTY);
+ final HLL emptyHLL = newHLL(HLLType.EMPTY);
+ cumulativeUnionLine(output, hll, emptyHLL, schemaVersion);
+
+ for(int i=0; i<1000/*number of rows to generate*/; i++) {
+ // make a FULL set and populate with
+ final HLL fullHLL = newHLL(HLLType.FULL);
+ final int elementCount = random.nextInt(10000/*arbitrary maximum cardinality*/);
+ for(int j=0;j<elementCount;j++) {
+ fullHLL.addRaw(random.nextLong());
+ }
+
+ cumulativeUnionLine(output, hll, fullHLL, schemaVersion);
+ }
+
+ output.flush();
+ output.close();
+ }
+
+ /**
+ * Unions an EMPTY accumulator with random HLLs.
+ *
+ * Format: cumulative union
+ * Tests:
+ * - hopefully all union possibilities
+ */
+ private static void globalUnionTest(final ISchemaVersion schemaVersion) throws IOException {
+ final Writer output = openOutput(schemaVersion, "comprehensive", TestType.UNION);
+
+ // the accumulator, starts empty
+ final HLL hll = newHLL(HLLType.EMPTY);
+ final HLL emptyHLL = newHLL(HLLType.EMPTY);
+
+ cumulativeUnionLine(output, hll, emptyHLL, schemaVersion);
+
+ for(int i=0; i<1000/*number of rows to generate*/; i++) {
+ final HLL randomHLL = generateRandomHLL();
+ cumulativeUnionLine(output, hll, randomHLL, schemaVersion);
+ }
+
+ output.flush();
+ output.close();
+ }
+
+ // ========================================================================
+ // Main
+ public static void fullSuite(final ISchemaVersion schemaVersion) throws IOException {
+ fullCardinalityCorrectionTest(schemaVersion);
+ globalUnionTest(schemaVersion);
+ globalStepTest(schemaVersion);
+ probabilisticUnionTest(schemaVersion);
+ explicitPromotionTest(schemaVersion);
+ explicitOverlapTest(schemaVersion);
+ sparseFullRepresentationTest(schemaVersion);
+ sparseStepTest(schemaVersion);
+ sparseRandomTest(schemaVersion);
+ sparseEdgeTest(schemaVersion);
+ sparseProbabilisticPromotionTest(schemaVersion);
+ sparseProbabilisticOverlapTest(schemaVersion);
+ }
+
+ public static void main(String[] args) throws IOException {
+ fullSuite(SerializationUtil.VERSION_ONE);
+ }
+
+ // ************************************************************************
+ // Helpers
+ /**
+ * Shortcut for testing constructor, which uses the constants defined at
+ * the top of the file as default parameters.
+ *
+ * @return a new {@link HLL} of specified type, which uses the parameters
+ * ({@link #LOG2M}, {@link #REGWIDTH}, {@link #EXPLICIT_THRESHOLD},
+ * and {@link #SPARSE_THRESHOLD}) specified above.
+ */
+ private static HLL newHLL(final HLLType type) {
+ return newHLL(type);
+ }
+
+ /**
+ * Returns the algorithm-specific cardinality of the specified {@link HLL}
+ * as a {@link String} appropriate for comparison with the algorithm-specific
+ * cardinality provided by the PostgreSQL implementation.
+ *
+ * @param hll the HLL whose algorithm-specific cardinality is to be printed.
+ * This cannot be <code>null</code>.
+ * @return the algorithm-specific cardinality of the instance as a PostgreSQL-
+ * compatible String. This will never be <code>null</code>
+ */
+ private static String stringCardinality(final HLL hll) {
+ switch(hll.getType()) {
+ case EMPTY:
+ return "0";
+ case EXPLICIT:/*promotion has not yet occurred*/
+ return Long.toString(hll.cardinality());
+ case SPARSE:
+ return Double.toString(hll.sparseProbabilisticAlgorithmCardinality());
+ case FULL:
+ return Double.toString(hll.fullProbabilisticAlgorithmCardinality());
+ default:
+ throw new RuntimeException("Unknown HLL type " + hll.getType());
+ }
+ }
+
+ /**
+ * Generates a random HLL and populates it with random values.
+ *
+ * @return the populated HLL. This will never be <code>null</code>.
+ */
+ public static HLL generateRandomHLL() {
+ final int randomTypeInt = randomIntBetween(0, HLLType.values().length - 1);
+ final HLLType type;
+ switch(randomTypeInt) {
+ case 0:
+ type = HLLType.EMPTY;
+ break;
+ case 1:
+ type = HLLType.EXPLICIT;
+ break;
+ case 2:
+ type = HLLType.FULL;
+ break;
+ case 3:
+ type = HLLType.EMPTY;
+ break;
+ case 4:
+ type = HLLType.SPARSE;
+ break;
+ default:
+ throw new RuntimeException("Unassigned type int " + randomTypeInt);
+ }
+
+ final int cardinalityCap;
+ final int cardinalityBaseline;
+
+ switch(type) {
+ case EMPTY:
+ return newHLL(HLLType.EMPTY);
+ case EXPLICIT:
+ cardinalityCap = EXPLICIT_THRESHOLD;
+ cardinalityBaseline = 1;
+ break;
+ case SPARSE:
+ cardinalityCap = SPARSE_THRESHOLD;
+ cardinalityBaseline = (EXPLICIT_THRESHOLD + 1);
+ break;
+ case FULL:
+ cardinalityCap = 100000;
+ cardinalityBaseline = (SPARSE_THRESHOLD*10);
+ break;
+ default:
+ throw new RuntimeException("We should never be here.");
+ }
+
+ final HLL hll = newHLL(HLLType.EMPTY);
+ for(int i=0; i<cardinalityBaseline; i++) {
+ hll.addRaw(randomLong());
+ }
+ for(int i=0; i<randomInt(cardinalityCap - cardinalityBaseline); i++) {
+ hll.addRaw(randomLong());
+ }
+
+ return hll;
+ }
+
+ /**
+ * Opens a {@link Writer} and writes out an appropriate CSV header.
+ *
+ * @param schemaVersion Schema version of the output. This cannot be
+ * <code>null</code>.
+ * @param description Description string used to build the filename.
+ * This cannot be <code>null</code>.
+ * @param type {@link TestType type} of the test file to be written.
+ * This cannot be <code>null</code>.
+ * @return The opened {@link Writer writer}. This will never be <code>null</code>.
+ */
+ private static Writer openOutput(final ISchemaVersion schemaVersion, final String description, final TestType type) throws IOException {
+ final String schemaVersionPrefix = "v"+ schemaVersion.schemaVersionNumber() + "_";
+ final String header;
+ final String filename;
+ switch(type) {
+ case ADD:
+ header = "cardinality,raw_value,HLL\n";
+ filename = schemaVersionPrefix + "cumulative_add_" + description + ".csv";
+ break;
+ case UNION:
+ header = "cardinality,HLL,union_cardinality,union_HLL\n";
+ filename = schemaVersionPrefix + "cumulative_union_" + description + ".csv";
+ break;
+ default:
+ throw new RuntimeException("Unknown test type " + type);
+ }
+
+ final Writer output = Files.newBufferedWriter(
+ Paths.get(OUTPUT_DIRECTORY, filename), StandardCharsets.UTF_8);
+ output.write(header);
+ output.flush();
+ return output;
+ }
+
+ /**
+ * Writes out a {@link TestType#ADD}-formatted test line.
+ *
+ * @param output The output {@link Writer writer}. This cannot be <code>null</code>.
+ * @param hll The "accumulator" HLL instance. This cannot be <code>null</code>.
+ * @param rawValue The raw value added to the HLL.
+ * @param schemaVersion the schema with which to serialize the HLLs. This cannot
+ * be <code>null</code>.
+ */
+ private static void cumulativeAddLine(final Writer output, final HLL hll, final long rawValue, final ISchemaVersion schemaVersion) throws IOException {
+ hll.addRaw(rawValue);
+ final String accumulatorCardinality = stringCardinality(hll);
+
+ output.write(accumulatorCardinality + "," + rawValue + "," + toByteA(hll, schemaVersion) + "\n");
+ output.flush();
+ }
+
+ /**
+ * Writes an initial line for a {@link TestType#ADD}-formatted test.
+ *
+ * @param output The output {@link Writer writer}. This cannot be <code>null</code>.
+ * @param hll The "accumulator" HLL instance. This cannot be <code>null</code>.
+ * @param schemaVersion the schema with which to serialize the HLLs. This cannot
+ * be <code>null</code>.
+ */
+ private static void initLineAdd(final Writer output, final HLL hll, final ISchemaVersion schemaVersion) throws IOException {
+ output.write(0 + "," + 0 + "," + toByteA(hll, schemaVersion) + "\n");
+ output.flush();
+ }
+
+ /**
+ * Writes out a {@link TestType#UNION}-formatted test line.
+ *
+ * @param output The output {@link Writer writer}. This cannot be <code>null</code>.
+ * @param hll The "accumulator" HLL instance. This cannot be <code>null</code>.
+ * @param increment The "increment" HLL instance which will be unioned into
+ * the accumulator. This cannot be <code>null</code>.
+ * @param schemaVersion the schema with which to serialize the HLLs. This cannot
+ * be <code>null</code>.
+ */
+ private static void cumulativeUnionLine(final Writer output, final HLL hll, final HLL increment, final ISchemaVersion schemaVersion) throws IOException {
+ hll.union(increment);
+
+ final String incrementCardinality = stringCardinality(increment);
+ final String accumulatorCardinality = stringCardinality(hll);
+ output.write(incrementCardinality + "," + toByteA(increment, schemaVersion) + "," + accumulatorCardinality + "," + toByteA(hll, schemaVersion) + "\n");
+ output.flush();
+ }
+
+ /**
+ * Serializes a HLL to Postgres 9 'bytea' hex-format, for CSV ingest.
+ *
+ * @param hll the HLL to serialize. This cannot be <code>null</code>.
+ * @param schemaVersion the schema with which to serialize the HLLs. This cannot
+ * be <code>null</code>.
+ * @return a PostgreSQL 'bytea' string representing the HLL.
+ */
+ private static String toByteA(final HLL hll, final ISchemaVersion schemaVersion) {
+ final byte[] bytes = hll.toBytes(schemaVersion);
+ return ("\\x" + NumberUtil.toHex(bytes, 0, bytes.length));
+ }
+
+ /**
+ * Indicates what kind of test output a test will generate.
+ */
+ private static enum TestType {
+ /**
+ * This type of test is characterized by values being added to an
+ * accumulator HLL whose serialized representation (after the value is added)
+ * is printed to each line along with the cardinality and added value.
+ */
+ ADD,
+ /**
+ * This type of test is characterized by HLLs being unioned into an
+ * accumulator HLL whose serialized representation (after the HLL is
+ * union'd) is printed to each line along with the cardinalities and the
+ * serialized representation of the HLL union'd in.
+ */
+ UNION;
+ }
+}
diff --git a/solr/core/src/test/org/apache/solr/util/hll/ProbabilisticTestUtil.java b/solr/core/src/test/org/apache/solr/util/hll/ProbabilisticTestUtil.java
new file mode 100644
index 0000000..9bf7465
--- /dev/null
+++ b/solr/core/src/test/org/apache/solr/util/hll/ProbabilisticTestUtil.java
@@ -0,0 +1,76 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.util.hll;
+
+/**
+ * A collection of test utilities for constructing input values to HLLs and for
+ * computing their serialized size.
+ */
+public class ProbabilisticTestUtil {
+ /**
+ * Constructs a value that when added raw to a HLL will set the register at
+ * <code>registerIndex</code> to <code>registerValue</code>.
+ *
+ * @param log2m the log-base-2 of the number of registers in the HLL
+ * @param registerIndex the index of the register to set
+ * @param registerValue the value to set the register to
+ * @return the value
+ */
+ public static long constructHLLValue(final int log2m, final int registerIndex, final int registerValue) {
+ final long partition = registerIndex;
+ final long substreamValue = (1L << (registerValue - 1));
+ return (substreamValue << log2m) | partition;
+ }
+
+ /**
+ * Extracts the HLL register index from a raw value.
+ */
+ public static short getRegisterIndex(final long rawValue, final int log2m) {
+ final long mBitsMask = (1 << log2m) - 1;
+ final short j = (short)(rawValue & mBitsMask);
+ return j;
+ }
+
+ /**
+ * Extracts the HLL register value from a raw value.
+ */
+ public static byte getRegisterValue(final long rawValue, final int log2m) {
+ final long substreamValue = (rawValue >>> log2m);
+ final byte p_w;
+
+ if (substreamValue == 0L) {
+ // The paper does not cover p(0x0), so the special value 0 is used.
+ // 0 is the original initialization value of the registers, so by
+ // doing this the HLL simply ignores it. This is acceptable
+ // because the probability is 1/(2^(2^registerSizeInBits)).
+ p_w = 0;
+ } else {
+ p_w = (byte)Math.min(1 + BitUtil.leastSignificantBit(substreamValue), 31);
+ }
+
+ return p_w;
+ }
+
+ /**
+ * @return the number of bytes required to pack <code>registerCount</code>
+ * registers of width <code>shortWordLength</code>.
+ */
+ public static int getRequiredBytes(final int shortWordLength, final int registerCount) {
+ return (int)Math.ceil((registerCount * shortWordLength)/(float)8);
+ }
+}
diff --git a/solr/core/src/test/org/apache/solr/util/hll/SparseHLLTest.java b/solr/core/src/test/org/apache/solr/util/hll/SparseHLLTest.java
new file mode 100644
index 0000000..c62d773
--- /dev/null
+++ b/solr/core/src/test/org/apache/solr/util/hll/SparseHLLTest.java
@@ -0,0 +1,453 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.util.hll;
+
+import org.apache.lucene.util.LuceneTestCase;
+import org.junit.Test;
+
+import com.carrotsearch.hppc.IntByteOpenHashMap;
+import com.carrotsearch.hppc.cursors.IntByteCursor;
+import com.carrotsearch.randomizedtesting.RandomizedTest;
+
+/**
+ * Tests {@link HLL} of type {@link HLLType#SPARSE}.
+ */
+public class SparseHLLTest extends LuceneTestCase {
+ private static final int log2m = 11;
+
+ /**
+ * Tests {@link HLL#addRaw(long)}.
+ */
+ @Test
+ public void addTest() {
+ { // insert an element with register value 1 (minimum set value)
+ final int registerIndex = 0;
+ final int registerValue = 1;
+ final long rawValue = ProbabilisticTestUtil.constructHLLValue(log2m, registerIndex, registerValue);
+
+ final HLL hll = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary*/, HLLType.SPARSE);
+ hll.addRaw(rawValue);
+
+ assertOneRegisterSet(hll, registerIndex, (byte)registerValue);
+ }
+ { // insert an element with register value 31 (maximum set value)
+ final int registerIndex = 0;
+ final int registerValue = 31;
+ final long rawValue = ProbabilisticTestUtil.constructHLLValue(log2m, registerIndex, registerValue);
+
+ final HLL hll = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary*/, HLLType.SPARSE);
+ hll.addRaw(rawValue);
+
+ assertOneRegisterSet(hll, registerIndex, (byte)registerValue);
+ }
+ { // insert an element that could overflow the register (past 31)
+ final int registerIndex = 0;
+ final int registerValue = 36;
+ final long rawValue = ProbabilisticTestUtil.constructHLLValue(log2m, registerIndex, registerValue);
+
+ final HLL hll = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary*/, HLLType.SPARSE);
+ hll.addRaw(rawValue);
+
+ assertOneRegisterSet(hll, (short)registerIndex, (byte)31/*register max*/);
+ }
+ { // insert duplicate elements, observe no change
+ final int registerIndex = 0;
+ final int registerValue = 1;
+ final long rawValue = ProbabilisticTestUtil.constructHLLValue(log2m, registerIndex, registerValue);
+
+ final HLL hll = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary*/, HLLType.SPARSE);
+ hll.addRaw(rawValue);
+ hll.addRaw(rawValue);
+
+ assertOneRegisterSet(hll, registerIndex, (byte)registerValue);
+ }
+ { // insert elements that increase a register's value
+ final int registerIndex = 0;
+ final int registerValue = 1;
+ final long rawValue = ProbabilisticTestUtil.constructHLLValue(log2m, registerIndex, registerValue);
+
+ final HLL hll = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary*/, HLLType.SPARSE);
+ hll.addRaw(rawValue);
+
+ final int registerValue2 = 2;
+ final long rawValue2 = ProbabilisticTestUtil.constructHLLValue(log2m, registerIndex, registerValue2);
+ hll.addRaw(rawValue2);
+
+ assertOneRegisterSet(hll, registerIndex, (byte)registerValue2);
+ }
+ { // insert elements that have lower register values, observe no change
+ final int registerIndex = 0;
+ final int registerValue = 2;
+ final long rawValue = ProbabilisticTestUtil.constructHLLValue(log2m, registerIndex, registerValue);
+
+ final HLL hll = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary*/, HLLType.SPARSE);
+ hll.addRaw(rawValue);
+
+ final int registerValue2 = 1;
+ final long rawValue2 = ProbabilisticTestUtil.constructHLLValue(log2m, registerIndex, registerValue2);
+ hll.addRaw(rawValue2);
+
+ assertOneRegisterSet(hll, registerIndex, (byte)registerValue);
+ }
+ }
+
+ /**
+ * Smoke test for {@link HLL#cardinality()} and the proper use of the small
+ * range correction.
+ */
+ @Test
+ public void smallRangeSmokeTest() {
+ final int log2m = 11;
+ final int m = (1 << log2m);
+ final int regwidth = 5;
+
+ // only one register set
+ {
+ final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary*/, HLLType.SPARSE);
+ hll.addRaw(ProbabilisticTestUtil.constructHLLValue(log2m, 0, 1));
+
+ final long cardinality = hll.cardinality();
+
+ // Trivially true that small correction conditions hold: one register
+ // set implies zeroes exist, and estimator trivially smaller than 5m/2.
+ // Small range correction: m * log(m/V)
+ final long expected = (long)Math.ceil(m * Math.log((double)m / (m - 1)/*# of zeroes*/));
+ assertEquals(cardinality, expected);
+ }
+
+ // all but one register set
+ {
+ final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary*/, HLLType.SPARSE);
+ for(int i=0; i<(m - 1); i++) {
+ hll.addRaw(ProbabilisticTestUtil.constructHLLValue(log2m, i, 1));
+ }
+
+ // Trivially true that small correction conditions hold: all but
+ // one register set implies a zero exists, and estimator trivially
+ // smaller than 5m/2 since it's alpha / ((m-1)/2)
+ final long cardinality = hll.cardinality();
+
+ // Small range correction: m * log(m/V)
+ final long expected = (long)Math.ceil(m * Math.log((double)m / 1/*# of zeroes*/));
+ assertEquals(cardinality, expected);
+ }
+ }
+
+ /**
+ * Smoke test for {@link HLL#cardinality()} and the proper use of the
+ * uncorrected estimator.
+ */
+ @Test
+ public void normalRangeSmokeTest() {
+ final int log2m = 11;
+ final int m = (1 << log2m);
+ final int regwidth = 5;
+ // regwidth = 5, so hash space is
+ // log2m + (2^5 - 1 - 1), so L = log2m + 30
+ final int l = log2m + 30;
+
+ // all registers at 'medium' value
+ {
+ final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, m/*sparseThreshold*/, HLLType.SPARSE);
+
+ final int registerValue = 7/*chosen to ensure neither correction kicks in*/;
+ for(int i=0; i<m; i++) {
+ hll.addRaw(ProbabilisticTestUtil.constructHLLValue(log2m, i, registerValue));
+ }
+
+ final long cardinality = hll.cardinality();
+
+ // Simplified estimator when all registers take same value: alpha / (m/2^val)
+ final double estimator = HLLUtil.alphaMSquared(m)/((double)m/Math.pow(2, registerValue));
+
+ // Assert conditions for uncorrected range
+ assertTrue(estimator <= Math.pow(2,l)/30);
+ assertTrue(estimator > (5 * m /(double)2));
+
+ final long expected = (long)Math.ceil(estimator);
+ assertEquals(cardinality, expected);
+ }
+ }
+
+ /**
+ * Smoke test for {@link HLL#cardinality()} and the proper use of the large
+ * range correction.
+ */
+ @Test
+ public void largeRangeSmokeTest() {
+ final int log2m = 11;
+ final int m = (1 << log2m);
+ final int regwidth = 5;
+ // regwidth = 5, so hash space is
+ // log2m + (2^5 - 1 - 1), so L = log2m + 30
+ final int l = log2m + 30;
+
+ // all registers at large value
+ {
+ final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, m/*sparseThreshold*/, HLLType.SPARSE);
+
+ final int registerValue = 31/*chosen to ensure large correction kicks in*/;
+ for(int i=0; i<m; i++) {
+ hll.addRaw(ProbabilisticTestUtil.constructHLLValue(log2m, i, registerValue));
+ }
+
+ final long cardinality = hll.cardinality();
+
+
+ // Simplified estimator when all registers take same value: alpha / (m/2^val)
+ final double estimator = HLLUtil.alphaMSquared(m)/((double)m/Math.pow(2, registerValue));
+
+ // Assert conditions for large range
+ assertTrue(estimator > Math.pow(2, l)/30);
+
+ // Large range correction: -2^32 * log(1 - E/2^32)
+ final long expected = (long)Math.ceil(-1.0 * Math.pow(2, l) * Math.log(1.0 - estimator/Math.pow(2, l)));
+ assertEquals(cardinality, expected);
+ }
+ }
+
+ /**
+ * Tests {@link HLL#union(HLL)}.
+ */
+ @Test
+ public void unionTest() {
+ final int log2m = 11/*arbitrary*/;
+ final int sparseThreshold = 256/*arbitrary*/;
+
+ { // two empty multisets should union to an empty set
+ final HLL hllA = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE);
+ final HLL hllB = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE);
+
+ hllA.union(hllB);
+
+ assertEquals(hllA.getType(), HLLType.SPARSE/*unchanged*/);
+ assertEquals(hllA.cardinality(), 0L);
+ }
+ { // two disjoint multisets should union properly
+ final HLL hllA = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE);
+ hllA.addRaw(ProbabilisticTestUtil.constructHLLValue(log2m, 1, 1));
+ final HLL hllB = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE);
+ hllB.addRaw(ProbabilisticTestUtil.constructHLLValue(log2m, 2, 1));
+
+
+ hllA.union(hllB);
+
+ assertEquals(hllA.getType(), HLLType.SPARSE/*unchanged*/);
+ assertEquals(hllA.cardinality(), 3L/*precomputed*/);
+ assertRegisterPresent(hllA, 1, (byte)1);
+ assertRegisterPresent(hllA, 2, (byte)1);
+ }
+ { // two exactly overlapping multisets should union properly
+ final HLL hllA = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE);
+ hllA.addRaw(ProbabilisticTestUtil.constructHLLValue(log2m, 1, 10));
+ final HLL hllB = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE);
+ hllB.addRaw(ProbabilisticTestUtil.constructHLLValue(log2m, 1, 13));
+
+ hllA.union(hllB);
+
+ assertEquals(hllA.getType(), HLLType.SPARSE/*unchanged*/);
+ assertEquals(hllA.cardinality(), 2L/*precomputed*/);
+ assertOneRegisterSet(hllA, 1, (byte)13/*max(10,13)*/);
+ }
+ { // overlapping multisets should union properly
+ final HLL hllA = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE);
+ final HLL hllB = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE);
+ // register index = 3
+ final long rawValueA = ProbabilisticTestUtil.constructHLLValue(log2m, 3, 11);
+
+ // register index = 4
+ final long rawValueB = ProbabilisticTestUtil.constructHLLValue(log2m, 4, 13);
+ final long rawValueBPrime = ProbabilisticTestUtil.constructHLLValue(log2m, 4, 21);
+
+ // register index = 5
+ final long rawValueC = ProbabilisticTestUtil.constructHLLValue(log2m, 5, 14);
+
+ hllA.addRaw(rawValueA);
+ hllA.addRaw(rawValueB);
+
+ hllB.addRaw(rawValueBPrime);
+ hllB.addRaw(rawValueC);
+
+ hllA.union(hllB);
+ // union should have three registers set, with partition B set to the
+ // max of the two registers
+ assertRegisterPresent(hllA, 3, (byte)11);
+ assertRegisterPresent(hllA, 4, (byte)21/*max(21,13)*/);
+ assertRegisterPresent(hllA, 5, (byte)14);
+ }
+ { // too-large unions should promote
+ final HLL hllA = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE);
+ final HLL hllB = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE);
+
+ // fill up sets to maxCapacity
+ for(int i=0; i<sparseThreshold; i++) {
+ hllA.addRaw(ProbabilisticTestUtil.constructHLLValue(log2m, i, 1));
+ hllB.addRaw(ProbabilisticTestUtil.constructHLLValue(log2m, (i + sparseThreshold)/*non-overlapping*/, 1));
+ }
+
+ hllA.union(hllB);
+
+ assertEquals(hllA.getType(), HLLType.FULL);
+ }
+ }
+
+ /**
+ * Tests {@link HLL#clear()}.
+ */
+ @Test
+ public void clearTest() {
+ final HLL hll = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary, unused*/, HLLType.SPARSE);
+ hll.addRaw(1L);
+ hll.clear();
+ assertEquals(hll.cardinality(), 0L);
+ }
+
+ /**
+ * Tests {@link HLL#toBytes(ISchemaVersion)} and
+ * {@link HLL#fromBytes(byte[])}.
+ */
+ @Test
+ public void toFromBytesTest() {
+ final int log2m = 11/*arbitrary*/;
+ final int regwidth = 5/*arbitrary*/;
+ final int sparseThreshold = 256/*arbitrary*/;
+ final int shortWordLength = 16/*log2m + regwidth = 11 + 5*/;
+
+ final ISchemaVersion schemaVersion = SerializationUtil.DEFAULT_SCHEMA_VERSION;
+ final HLLType type = HLLType.SPARSE;
+ final int padding = schemaVersion.paddingBytes(type);
+
+ {// Should work on an empty element
+ final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE);
+ final byte[] bytes = hll.toBytes(schemaVersion);
+
+ // output should just be padding since no registers are used
+ assertEquals(bytes.length, padding);
+
+ final HLL inHLL = HLL.fromBytes(bytes);
+
+ // assert register values correct
+ assertElementsEqual(hll, inHLL);
+ }
+ {// Should work on a partially filled element
+ final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE);
+
+ for(int i=0; i<3; i++) {
+ final long rawValue = ProbabilisticTestUtil.constructHLLValue(log2m, i, (i+9));
+ hll.addRaw(rawValue);
+ }
+
+ final byte[] bytes = hll.toBytes(schemaVersion);
+
+ assertEquals(bytes.length, padding + ProbabilisticTestUtil.getRequiredBytes(shortWordLength, 3/*registerCount*/));
+
+ final HLL inHLL = HLL.fromBytes(bytes);
+
+ // assert register values correct
+ assertElementsEqual(hll, inHLL);
+ }
+ {// Should work on a full set
+ final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE);
+
+ for(int i=0; i<sparseThreshold; i++) {
+ final long rawValue = ProbabilisticTestUtil.constructHLLValue(log2m, i, (i % 9) + 1);
+ hll.addRaw(rawValue);
+ }
+
+ final byte[] bytes = hll.toBytes(schemaVersion);
+
+ // 'short words' should be 12 bits + 5 bits = 17 bits long
+ assertEquals(bytes.length, padding + ProbabilisticTestUtil.getRequiredBytes(shortWordLength, sparseThreshold));
+
+ final HLL inHLL = HLL.fromBytes(bytes);
+
+ // assert register values correct
+ assertElementsEqual(hll, inHLL);
+ }
+ }
+
+ /**
+ * Smoke tests the multisets by adding random values.
+ */
+ @Test
+ public void randomValuesTest() {
+ final int log2m = 11/*arbitrary*/;
+ final int regwidth = 5/*arbitrary*/;
+ final int sparseThreshold = 256/*arbitrary*/;
+
+ for(int run=0; run<100; run++) {
+ final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE);
+
+ final IntByteOpenHashMap map = new IntByteOpenHashMap();
+
+ for(int i=0; i<sparseThreshold; i++) {
+ final long rawValue = RandomizedTest.randomLong();
+
+ final short registerIndex = ProbabilisticTestUtil.getRegisterIndex(rawValue, log2m);
+ final byte registerValue = ProbabilisticTestUtil.getRegisterValue(rawValue, log2m);
+ if(map.get(registerIndex) < registerValue) {
+ map.put(registerIndex, registerValue);
+ }
+
+ hll.addRaw(rawValue);
+ }
+
+ for (IntByteCursor c : map) {
+ final byte expectedRegisterValue = map.get(c.key);
+ assertRegisterPresent(hll, c.key, expectedRegisterValue);
+ }
+ }
+ }
+
+ //*************************************************************************
+ // assertion helpers
+ /**
+ * Asserts that the register at the specified index is set to the specified
+ * value.
+ */
+ private static void assertRegisterPresent(final HLL hll,
+ final int registerIndex,
+ final int registerValue) {
+ final IntByteOpenHashMap sparseProbabilisticStorage = hll.sparseProbabilisticStorage;
+ assertEquals(sparseProbabilisticStorage.get(registerIndex), registerValue);
+ }
+
+ /**
+ * Asserts that only the specified register is set and has the specified value.
+ */
+ private static void assertOneRegisterSet(final HLL hll,
+ final int registerIndex,
+ final byte registerValue) {
+ final IntByteOpenHashMap sparseProbabilisticStorage = hll.sparseProbabilisticStorage;
+ assertEquals(sparseProbabilisticStorage.size(), 1);
+ assertEquals(sparseProbabilisticStorage.get(registerIndex), registerValue);
+ }
+
+ /**
+ * Asserts that all registers in the two {@link HLL} instances are identical.
+ */
+ private static void assertElementsEqual(final HLL hllA, final HLL hllB) {
+ final IntByteOpenHashMap sparseProbabilisticStorageA = hllA.sparseProbabilisticStorage;
+ final IntByteOpenHashMap sparseProbabilisticStorageB = hllB.sparseProbabilisticStorage;
+ assertEquals(sparseProbabilisticStorageA.size(), sparseProbabilisticStorageB.size());
+ for (IntByteCursor c : sparseProbabilisticStorageA) {
+ assertEquals(sparseProbabilisticStorageA.get(c.key),
+ sparseProbabilisticStorageB.get(c.key));
+ }
+ }
+}