blob: dd46119dee77b0149dfc07aebde6720fec974043 [file] [log] [blame]
/*
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hbase.util;
import java.io.ByteArrayOutputStream;
import java.io.DataOutputStream;
import java.nio.ByteBuffer;
import junit.framework.TestCase;
import org.apache.hadoop.hbase.nio.MultiByteBuff;
import org.apache.hadoop.hbase.testclassification.MiscTests;
import org.apache.hadoop.hbase.testclassification.SmallTests;
import org.junit.experimental.categories.Category;
@Category({MiscTests.class, SmallTests.class})
public class TestBloomFilterChunk extends TestCase {
public void testBasicBloom() throws Exception {
BloomFilterChunk bf1 = new BloomFilterChunk(1000, (float)0.01, Hash.MURMUR_HASH, 0);
BloomFilterChunk bf2 = new BloomFilterChunk(1000, (float)0.01, Hash.MURMUR_HASH, 0);
bf1.allocBloom();
bf2.allocBloom();
// test 1: verify no fundamental false negatives or positives
byte[] key1 = {1,2,3,4,5,6,7,8,9};
byte[] key2 = {1,2,3,4,5,6,7,8,7};
bf1.add(key1);
bf2.add(key2);
assertTrue(BloomFilterUtil.contains(key1, 0, key1.length, new MultiByteBuff(bf1.bloom), 0,
(int) bf1.byteSize, bf1.hash, bf1.hashCount));
assertFalse(BloomFilterUtil.contains(key2, 0, key2.length, new MultiByteBuff(bf1.bloom), 0,
(int) bf1.byteSize, bf1.hash, bf1.hashCount));
assertFalse(BloomFilterUtil.contains(key1, 0, key1.length, new MultiByteBuff(bf2.bloom), 0,
(int) bf2.byteSize, bf2.hash, bf2.hashCount));
assertTrue(BloomFilterUtil.contains(key2, 0, key2.length, new MultiByteBuff(bf2.bloom), 0,
(int) bf2.byteSize, bf2.hash, bf2.hashCount));
byte [] bkey = {1,2,3,4};
byte [] bval = "this is a much larger byte array".getBytes();
bf1.add(bkey);
bf1.add(bval, 1, bval.length-1);
assertTrue(BloomFilterUtil.contains(bkey, 0, bkey.length, new MultiByteBuff(bf1.bloom), 0,
(int) bf1.byteSize, bf1.hash, bf1.hashCount));
assertTrue(BloomFilterUtil.contains(bval, 1, bval.length - 1, new MultiByteBuff(bf1.bloom),
0, (int) bf1.byteSize, bf1.hash, bf1.hashCount));
assertFalse(BloomFilterUtil.contains(bval, 0, bval.length, new MultiByteBuff(bf1.bloom), 0,
(int) bf1.byteSize, bf1.hash, bf1.hashCount));
// test 2: serialization & deserialization.
// (convert bloom to byte array & read byte array back in as input)
ByteArrayOutputStream bOut = new ByteArrayOutputStream();
bf1.writeBloom(new DataOutputStream(bOut));
ByteBuffer bb = ByteBuffer.wrap(bOut.toByteArray());
BloomFilterChunk newBf1 = new BloomFilterChunk(1000, (float)0.01,
Hash.MURMUR_HASH, 0);
assertTrue(BloomFilterUtil.contains(key1, 0, key1.length, new MultiByteBuff(bb), 0,
(int) newBf1.byteSize, newBf1.hash, newBf1.hashCount));
assertFalse(BloomFilterUtil.contains(key2, 0, key2.length, new MultiByteBuff(bb), 0,
(int) newBf1.byteSize, newBf1.hash, newBf1.hashCount));
assertTrue(BloomFilterUtil.contains(bkey, 0, bkey.length, new MultiByteBuff(bb), 0,
(int) newBf1.byteSize, newBf1.hash, newBf1.hashCount));
assertTrue(BloomFilterUtil.contains(bval, 1, bval.length - 1, new MultiByteBuff(bb), 0,
(int) newBf1.byteSize, newBf1.hash, newBf1.hashCount));
assertFalse(BloomFilterUtil.contains(bval, 0, bval.length, new MultiByteBuff(bb), 0,
(int) newBf1.byteSize, newBf1.hash, newBf1.hashCount));
assertFalse(BloomFilterUtil.contains(bval, 0, bval.length, new MultiByteBuff(bb), 0,
(int) newBf1.byteSize, newBf1.hash, newBf1.hashCount));
System.out.println("Serialized as " + bOut.size() + " bytes");
assertTrue(bOut.size() - bf1.byteSize < 10); //... allow small padding
}
public void testBloomFold() throws Exception {
// test: foldFactor < log(max/actual)
BloomFilterChunk b = new BloomFilterChunk(1003, (float) 0.01,
Hash.MURMUR_HASH, 2);
b.allocBloom();
long origSize = b.getByteSize();
assertEquals(1204, origSize);
for (int i = 0; i < 12; ++i) {
b.add(Bytes.toBytes(i));
}
b.compactBloom();
assertEquals(origSize>>2, b.getByteSize());
int falsePositives = 0;
for (int i = 0; i < 25; ++i) {
byte[] bytes = Bytes.toBytes(i);
if (BloomFilterUtil.contains(bytes, 0, bytes.length, new MultiByteBuff(b.bloom), 0,
(int) b.byteSize, b.hash, b.hashCount)) {
if (i >= 12)
falsePositives++;
} else {
assertFalse(i < 12);
}
}
assertTrue(falsePositives <= 1);
// test: foldFactor > log(max/actual)
}
public void testBloomPerf() throws Exception {
// add
float err = (float)0.01;
BloomFilterChunk b = new BloomFilterChunk(10*1000*1000, (float)err, Hash.MURMUR_HASH, 3);
b.allocBloom();
long startTime = System.currentTimeMillis();
long origSize = b.getByteSize();
for (int i = 0; i < 1*1000*1000; ++i) {
b.add(Bytes.toBytes(i));
}
long endTime = System.currentTimeMillis();
System.out.println("Total Add time = " + (endTime - startTime) + "ms");
// fold
startTime = System.currentTimeMillis();
b.compactBloom();
endTime = System.currentTimeMillis();
System.out.println("Total Fold time = " + (endTime - startTime) + "ms");
assertTrue(origSize >= b.getByteSize()<<3);
// test
startTime = System.currentTimeMillis();
int falsePositives = 0;
for (int i = 0; i < 2*1000*1000; ++i) {
byte[] bytes = Bytes.toBytes(i);
if (BloomFilterUtil.contains(bytes, 0, bytes.length, new MultiByteBuff(b.bloom), 0,
(int) b.byteSize, b.hash, b.hashCount)) {
if (i >= 1 * 1000 * 1000)
falsePositives++;
} else {
assertFalse(i < 1*1000*1000);
}
}
endTime = System.currentTimeMillis();
System.out.println("Total Contains time = " + (endTime - startTime) + "ms");
System.out.println("False Positive = " + falsePositives);
assertTrue(falsePositives <= (1*1000*1000)*err);
// test: foldFactor > log(max/actual)
}
public void testSizing() {
int bitSize = 8 * 128 * 1024; // 128 KB
double errorRate = 0.025; // target false positive rate
// How many keys can we store in a Bloom filter of this size maintaining
// the given false positive rate, not taking into account that the n
long maxKeys = BloomFilterUtil.idealMaxKeys(bitSize, errorRate);
assertEquals(136570, maxKeys);
// A reverse operation: how many bits would we need to store this many keys
// and keep the same low false positive rate?
long bitSize2 = BloomFilterUtil.computeBitSize(maxKeys, errorRate);
// The bit size comes out a little different due to rounding.
assertTrue(Math.abs(bitSize2 - bitSize) * 1.0 / bitSize < 1e-5);
}
public void testFoldableByteSize() {
assertEquals(128, BloomFilterUtil.computeFoldableByteSize(1000, 5));
assertEquals(640, BloomFilterUtil.computeFoldableByteSize(5001, 4));
}
}