blob: 3c742c9843cdd6e7433a9470f4fe174e2b0e411a [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
package org.apache.doris.load.loadv2.dpp;
import org.apache.doris.load.loadv2.dpp.Hll;
import org.junit.Assert;
import org.junit.Test;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutput;
import java.io.DataOutputStream;
import java.io.IOException;
public class HllTest {
@Test
public void testFindFirstNonZeroBitPosition() {
Assert.assertTrue(Hll.getLongTailZeroNum(0) == 0);
Assert.assertTrue(Hll.getLongTailZeroNum(1) == 0);
Assert.assertTrue(Hll.getLongTailZeroNum(1l << 30) == 30);
Assert.assertTrue(Hll.getLongTailZeroNum(1l << 62) == 62);
}
@Test
public void HllBasicTest() throws IOException {
// test empty
Hll emptyHll = new Hll();
Assert.assertTrue(emptyHll.getType() == Hll.HLL_DATA_EMPTY);
Assert.assertTrue(emptyHll.estimateCardinality() == 0);
ByteArrayOutputStream emptyOutputStream = new ByteArrayOutputStream();
DataOutput output = new DataOutputStream(emptyOutputStream);
emptyHll.serialize(output);
DataInputStream emptyInputStream = new DataInputStream(new ByteArrayInputStream(emptyOutputStream.toByteArray()));
Hll deserializedEmptyHll = new Hll();
deserializedEmptyHll.deserialize(emptyInputStream);
Assert.assertTrue(deserializedEmptyHll.getType() == Hll.HLL_DATA_EMPTY);
// test explicit
Hll explicitHll = new Hll();
for (int i = 0; i < Hll.HLL_EXPLICLIT_INT64_NUM; i++) {
explicitHll.updateWithHash(i);
}
Assert.assertTrue(explicitHll.getType() == Hll.HLL_DATA_EXPLICIT);
Assert.assertTrue(explicitHll.estimateCardinality() == Hll.HLL_EXPLICLIT_INT64_NUM);
ByteArrayOutputStream explicitOutputStream = new ByteArrayOutputStream();
DataOutput explicitOutput = new DataOutputStream(explicitOutputStream);
explicitHll.serialize(explicitOutput);
DataInputStream explicitInputStream = new DataInputStream(new ByteArrayInputStream(explicitOutputStream.toByteArray()));
Hll deserializedExplicitHll = new Hll();
deserializedExplicitHll.deserialize(explicitInputStream);
Assert.assertTrue(deserializedExplicitHll.getType() == Hll.HLL_DATA_EXPLICIT);
// test sparse
Hll sparseHll = new Hll();
for (int i = 0; i < Hll.HLL_SPARSE_THRESHOLD; i++) {
sparseHll.updateWithHash(i);
}
Assert.assertTrue(sparseHll.getType() == Hll.HLL_DATA_FULL);
// 2% error rate
Assert.assertTrue(sparseHll.estimateCardinality() > Hll.HLL_SPARSE_THRESHOLD * (1 - 0.02) &&
sparseHll.estimateCardinality() < Hll.HLL_SPARSE_THRESHOLD * (1 + 0.02));
ByteArrayOutputStream sparseOutputStream = new ByteArrayOutputStream();
DataOutput sparseOutput = new DataOutputStream(sparseOutputStream);
sparseHll.serialize(sparseOutput);
DataInputStream sparseInputStream = new DataInputStream(new ByteArrayInputStream(sparseOutputStream.toByteArray()));
Hll deserializedSparseHll = new Hll();
deserializedSparseHll.deserialize(sparseInputStream);
Assert.assertTrue(deserializedSparseHll.getType() == Hll.HLL_DATA_SPARSE);
Assert.assertTrue(sparseHll.estimateCardinality() == deserializedSparseHll.estimateCardinality());
// test full
Hll fullHll = new Hll();
for (int i = 1; i <= Short.MAX_VALUE; i++) {
fullHll.updateWithHash(i);
}
Assert.assertTrue(fullHll.getType() == Hll.HLL_DATA_FULL);
// the result 32748 is consistent with C++ 's implementation
Assert.assertTrue(fullHll.estimateCardinality() == 32748);
Assert.assertTrue(fullHll.estimateCardinality() > Short.MAX_VALUE * (1 - 0.02) &&
fullHll.estimateCardinality() < Short.MAX_VALUE * (1 + 0.02));
ByteArrayOutputStream fullHllOutputStream = new ByteArrayOutputStream();
DataOutput fullHllOutput = new DataOutputStream(fullHllOutputStream);
fullHll.serialize(fullHllOutput);
DataInputStream fullHllInputStream = new DataInputStream(new ByteArrayInputStream(fullHllOutputStream.toByteArray()));
Hll deserializedFullHll = new Hll();
deserializedFullHll.deserialize(fullHllInputStream);
Assert.assertTrue(deserializedFullHll.getType() == Hll.HLL_DATA_FULL);
Assert.assertTrue(deserializedFullHll.estimateCardinality() == fullHll.estimateCardinality());
}
// keep logic same with C++ version
// add additional compare logic with C++ version's estimateValue
@Test
public void testCompareEstimateValueWithBe() throws IOException {
//empty
{
Hll hll = new Hll();
long estimateValue = hll.estimateCardinality();
byte[] serializedByte = serializeHll(hll);
hll = deserializeHll(serializedByte);
Assert.assertTrue(estimateValue == hll.estimateCardinality());
}
// explicit [0. 100)
Hll explicitHll = new Hll();
{
for (int i = 0; i < 100; i++) {
explicitHll.updateWithHash(i);
}
Assert.assertTrue(explicitHll.estimateCardinality() == 100);
// check serialize
byte[] serializeHll = serializeHll(explicitHll);
explicitHll = deserializeHll(serializeHll);
Assert.assertTrue(explicitHll.estimateCardinality() == 100);
Hll otherHll = new Hll();
for (int i = 0; i < 100; i++) {
otherHll.updateWithHash(i);
}
explicitHll.merge(otherHll);
// compare with C++ version result
Assert.assertTrue(explicitHll.estimateCardinality() == 100);
}
// sparse [1024, 2048)
Hll sparseHll = new Hll();
{
for (int i = 0; i < 1024; i++) {
sparseHll.updateWithHash(i + 1024);
}
long preValue = sparseHll.estimateCardinality();
// check serialize
byte[] serializedHll = serializeHll(sparseHll);
Assert.assertTrue(serializedHll.length < Hll.HLL_REGISTERS_COUNT + 1);
sparseHll = deserializeHll(serializedHll);
Assert.assertTrue(sparseHll.estimateCardinality() == preValue);
Assert.assertTrue(sparseHll.getType() == Hll.HLL_DATA_SPARSE);
Hll otherHll = new Hll();
for (int i = 0; i < 1024; i++) {
otherHll.updateWithHash(i + 1024);
}
sparseHll.updateWithHash(1024);
sparseHll.merge(otherHll);
long cardinality = sparseHll.estimateCardinality();
Assert.assertTrue(preValue == cardinality);
// 2% error rate
Assert.assertTrue(cardinality > 1000 && cardinality < 1045);
// compare with C++ version result
Assert.assertTrue(cardinality == 1023);
}
// full [64 * 1024, 128 * 1024)
Hll fullHll = new Hll();
{
for (int i = 0; i < 64 * 1024; i++) {
fullHll.updateWithHash(64 * 1024 + i);
}
long preValue = fullHll.estimateCardinality();
// check serialize
byte[] serializedHll = serializeHll(fullHll);
fullHll = deserializeHll(serializedHll);
Assert.assertTrue(fullHll.estimateCardinality() == preValue);
Assert.assertTrue(serializedHll.length == Hll.HLL_REGISTERS_COUNT + 1);
// 2% error rate
Assert.assertTrue(preValue > 62 * 1024 && preValue < 66 * 1024);
// compare with C++ version result
Assert.assertTrue(preValue == 66112);
}
// merge explicit to empty_hll
{
Hll newExplicit = new Hll();
newExplicit.merge(explicitHll);
Assert.assertTrue(newExplicit.estimateCardinality() == 100);
// merge another explicit
{
Hll otherHll = new Hll();
for (int i = 100; i < 200; i++) {
otherHll.updateWithHash(i);
}
// this is converted to full
otherHll.merge(newExplicit);
Assert.assertTrue(otherHll.estimateCardinality() > 190);
// compare with C++ version result
Assert.assertTrue(otherHll.estimateCardinality() == 201);
}
// merge full
{
newExplicit.merge(fullHll);
Assert.assertTrue(newExplicit.estimateCardinality() > fullHll.estimateCardinality());
// compare with C++ version result
Assert.assertTrue(newExplicit.estimateCardinality() == 66250);
}
}
// merge sparse into empty
{
Hll newSparseHll = new Hll();
newSparseHll.merge(sparseHll);
Assert.assertTrue(sparseHll.estimateCardinality() == newSparseHll.estimateCardinality());
// compare with C++ version result
Assert.assertTrue(newSparseHll.estimateCardinality() == 1023);
// merge explicit
newSparseHll.merge(explicitHll);
Assert.assertTrue(newSparseHll.estimateCardinality() > sparseHll.estimateCardinality());
// compare with C++ version result
Assert.assertTrue(newSparseHll.estimateCardinality() == 1123);
// merge full
newSparseHll.merge(fullHll);
Assert.assertTrue(newSparseHll.estimateCardinality() > fullHll.estimateCardinality());
// compare with C++ version result
Assert.assertTrue(newSparseHll.estimateCardinality() == 67316);
}
}
private byte[] serializeHll(Hll hll) throws IOException {
ByteArrayOutputStream fullHllOutputStream = new ByteArrayOutputStream();
DataOutput fullHllOutput = new DataOutputStream(fullHllOutputStream);
hll.serialize(fullHllOutput);
return fullHllOutputStream.toByteArray();
}
private Hll deserializeHll(byte[] hllBytes) throws IOException {
DataInputStream fullHllInputStream = new DataInputStream(new ByteArrayInputStream(hllBytes));
Hll hll = new Hll();
hll.deserialize(fullHllInputStream);
return hll;
}
}