/*
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hbase.regionserver;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.when;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Random;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.CellComparator;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.HBaseTestingUtility;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.KeyValueUtil;
import org.apache.hadoop.hbase.io.hfile.RandomKeyValueUtil;
import org.apache.hadoop.hbase.testclassification.MediumTests;
import org.apache.hadoop.hbase.testclassification.RegionServerTests;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.hfile.BlockCache;
import org.apache.hadoop.hbase.io.hfile.CacheConfig;
import org.apache.hadoop.hbase.io.hfile.CompoundBloomFilter;
import org.apache.hadoop.hbase.io.hfile.CompoundBloomFilterWriter;
import org.apache.hadoop.hbase.io.hfile.HFile;
import org.apache.hadoop.hbase.io.hfile.HFileContext;
import org.apache.hadoop.hbase.io.hfile.HFileContextBuilder;
import org.apache.hadoop.hbase.util.BloomFilterFactory;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.BloomFilterUtil;
import org.junit.Before;
import org.junit.Test;
import org.junit.experimental.categories.Category;
/**
* Tests writing Bloom filter blocks in the same part of the file as data
* blocks.
*/
@Category({RegionServerTests.class, MediumTests.class})
public class TestCompoundBloomFilter {
private static final HBaseTestingUtility TEST_UTIL =
new HBaseTestingUtility();
private static final Log LOG = LogFactory.getLog(
TestCompoundBloomFilter.class);
private static final int NUM_TESTS = 9;
  private static final BloomType[] BLOOM_TYPES = { BloomType.ROW,
      BloomType.ROW, BloomType.ROWCOL, BloomType.ROWCOL, BloomType.ROW,
      BloomType.ROWCOL, BloomType.ROWCOL, BloomType.ROWCOL, BloomType.ROW };
  private static final int[] NUM_KV;
static {
final int N = 10000; // Only used in initialization.
NUM_KV = new int[] { 21870, N, N, N, N, 1000, N, 7500, 7500};
assert NUM_KV.length == NUM_TESTS;
}
  private static final int[] BLOCK_SIZES;
static {
final int blkSize = 65536;
BLOCK_SIZES = new int[] { 512, 1000, blkSize, blkSize, blkSize, 128, 300,
blkSize, blkSize };
assert BLOCK_SIZES.length == NUM_TESTS;
}
/**
* Be careful not to specify too high a Bloom filter block size, otherwise
* there will only be one oversized chunk and the observed false positive
* rate will be too low.
*/
  private static final int[] BLOOM_BLOCK_SIZES = { 1000, 4096, 4096, 4096,
      8192, 128, 1024, 600, 600 };
static { assert BLOOM_BLOCK_SIZES.length == NUM_TESTS; }
  private static final double[] TARGET_ERROR_RATES = { 0.025, 0.01, 0.015,
      0.01, 0.03, 0.01, 0.01, 0.07, 0.07 };
static { assert TARGET_ERROR_RATES.length == NUM_TESTS; }
/** A false positive rate that is obviously too high. */
private static final double TOO_HIGH_ERROR_RATE;
static {
double m = 0;
for (double errorRate : TARGET_ERROR_RATES)
m = Math.max(m, errorRate);
TOO_HIGH_ERROR_RATE = m + 0.03;
}
private static Configuration conf;
private static CacheConfig cacheConf;
private FileSystem fs;
private BlockCache blockCache;
  /** A message of the form "in test #<number>:" to include in logging. */
private String testIdMsg;
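  // Fixed seeds keep key generation and Bloom filter probing deterministic
  // across runs.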
private static final int GENERATION_SEED = 2319;
private static final int EVALUATION_SEED = 135;
@Before
public void setUp() throws IOException {
conf = TEST_UTIL.getConfiguration();
    // This test requires the most recent HFile format, so request the maximum
    // supported version.
conf.setInt(HFile.FORMAT_VERSION_KEY, HFile.MAX_FORMAT_VERSION);
fs = FileSystem.get(conf);
cacheConf = new CacheConfig(conf);
blockCache = cacheConf.getBlockCache();
assertNotNull(blockCache);
}
private List<KeyValue> createSortedKeyValues(Random rand, int n) {
List<KeyValue> kvList = new ArrayList<KeyValue>(n);
for (int i = 0; i < n; ++i)
kvList.add(RandomKeyValueUtil.randomKeyValue(rand));
Collections.sort(kvList, CellComparator.COMPARATOR);
return kvList;
}
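  /**
   * For each of the {@link #NUM_TESTS} configurations: generates sorted
   * KeyValues from a fixed seed, writes them to a store file with the
   * configured Bloom type, block sizes and target error rate, then reads the
   * file back and validates the Bloom filter's false negative and false
   * positive behavior.
   */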
@Test
public void testCompoundBloomFilter() throws IOException {
conf.setBoolean(BloomFilterFactory.IO_STOREFILE_BLOOM_ENABLED, true);
for (int t = 0; t < NUM_TESTS; ++t) {
conf.setFloat(BloomFilterFactory.IO_STOREFILE_BLOOM_ERROR_RATE,
(float) TARGET_ERROR_RATES[t]);
testIdMsg = "in test #" + t + ":";
Random generationRand = new Random(GENERATION_SEED);
List<KeyValue> kvs = createSortedKeyValues(generationRand, NUM_KV[t]);
BloomType bt = BLOOM_TYPES[t];
Path sfPath = writeStoreFile(t, bt, kvs);
readStoreFile(t, bt, kvs, sfPath);
}
}
/**
* Validates the false positive ratio by computing its z-value and comparing
* it to the provided threshold.
*
   * @param falsePosRate experimentally measured false positive rate
* @param nTrials the number of Bloom filter checks
* @param zValueBoundary z-value boundary, positive for an upper bound and
* negative for a lower bound
* @param cbf the compound Bloom filter we are using
* @param additionalMsg additional message to include in log output and
* assertion failures
*/
private void validateFalsePosRate(double falsePosRate, int nTrials,
double zValueBoundary, CompoundBloomFilter cbf, String additionalMsg) {
double p = BloomFilterFactory.getErrorRate(conf);
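    // Normal approximation to the binomial: under the null hypothesis that
    // the true false positive rate equals the configured error rate p, the
    // observed rate over nTrials has standard deviation
    // sqrt(p * (1 - p) / nTrials), so zValue is the number of standard
    // deviations the observation lies from the target.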
double zValue = (falsePosRate - p) / Math.sqrt(p * (1 - p) / nTrials);
String assortedStatsStr = " (targetErrorRate=" + p + ", falsePosRate="
+ falsePosRate + ", nTrials=" + nTrials + ")";
LOG.info("z-value is " + zValue + assortedStatsStr);
boolean isUpperBound = zValueBoundary > 0;
    if ((isUpperBound && zValue > zValueBoundary) ||
        (!isUpperBound && zValue < zValueBoundary)) {
String errorMsg = "False positive rate z-value " + zValue + " is "
+ (isUpperBound ? "higher" : "lower") + " than " + zValueBoundary
+ assortedStatsStr + ". Per-chunk stats:\n"
+ cbf.formatTestingStats();
fail(errorMsg + additionalMsg);
}
}
private void readStoreFile(int t, BloomType bt, List<KeyValue> kvs,
Path sfPath) throws IOException {
StoreFile sf = new StoreFile(fs, sfPath, conf, cacheConf, bt);
StoreFileReader r = sf.createReader();
final boolean pread = true; // does not really matter
StoreFileScanner scanner = r.getStoreFileScanner(true, pread, false, 0, 0, false);
{
// Test for false negatives (not allowed).
int numChecked = 0;
for (KeyValue kv : kvs) {
byte[] row = CellUtil.cloneRow(kv);
boolean present = isInBloom(scanner, row, CellUtil.cloneQualifier(kv));
assertTrue(testIdMsg + " Bloom filter false negative on row "
+ Bytes.toStringBinary(row) + " after " + numChecked
+ " successful checks", present);
++numChecked;
}
}
// Test for false positives (some percentage allowed). We test in two modes:
// "fake lookup" which ignores the key distribution, and production mode.
for (boolean fakeLookupEnabled : new boolean[] { true, false }) {
BloomFilterUtil.setFakeLookupMode(fakeLookupEnabled);
try {
String fakeLookupModeStr = ", fake lookup is " + (fakeLookupEnabled ?
"enabled" : "disabled");
CompoundBloomFilter cbf = (CompoundBloomFilter) r.getGeneralBloomFilter();
cbf.enableTestingStats();
int numFalsePos = 0;
Random rand = new Random(EVALUATION_SEED);
int nTrials = NUM_KV[t] * 10;
for (int i = 0; i < nTrials; ++i) {
byte[] query = RandomKeyValueUtil.randomRowOrQualifier(rand);
if (isInBloom(scanner, query, bt, rand)) {
numFalsePos += 1;
}
}
double falsePosRate = numFalsePos * 1.0 / nTrials;
LOG.debug(String.format(testIdMsg
+ " False positives: %d out of %d (%f)",
numFalsePos, nTrials, falsePosRate) + fakeLookupModeStr);
        // Sanity check: an egregiously high false positive rate indicates an
        // obviously broken Bloom filter.
assertTrue("False positive is too high: " + falsePosRate + " (greater "
+ "than " + TOO_HIGH_ERROR_RATE + ")" + fakeLookupModeStr,
falsePosRate < TOO_HIGH_ERROR_RATE);
        // Now a more precise check that the false positive rate is not too
        // high. We use a more relaxed bound for the real-world case than for
        // the "fake lookup" case because our hash functions are not
        // completely independent.
double maxZValue = fakeLookupEnabled ? 1.96 : 2.5;
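        // For reference, a one-sided z-value bound of 1.96 corresponds to
        // roughly a 2.5% significance level under the normal approximation,
        // and the -2.58 lower bound used below to roughly 0.5%.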
validateFalsePosRate(falsePosRate, nTrials, maxZValue, cbf,
fakeLookupModeStr);
// For checking the lower bound we need to eliminate the last chunk,
// because it is frequently smaller and the false positive rate in it
// is too low. This does not help if there is only one under-sized
// chunk, though.
int nChunks = cbf.getNumChunks();
if (nChunks > 1) {
numFalsePos -= cbf.getNumPositivesForTesting(nChunks - 1);
nTrials -= cbf.getNumQueriesForTesting(nChunks - 1);
falsePosRate = numFalsePos * 1.0 / nTrials;
LOG.info(testIdMsg + " False positive rate without last chunk is " +
falsePosRate + fakeLookupModeStr);
}
validateFalsePosRate(falsePosRate, nTrials, -2.58, cbf,
fakeLookupModeStr);
} finally {
BloomFilterUtil.setFakeLookupMode(false);
}
}
r.close(true); // end of test so evictOnClose
}
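  /**
   * Checks Bloom filter membership for the given row paired with a randomly
   * generated qualifier; the Bloom type parameter is not used directly here.
   */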
private boolean isInBloom(StoreFileScanner scanner, byte[] row, BloomType bt,
Random rand) {
return isInBloom(scanner, row, RandomKeyValueUtil.randomRowOrQualifier(rand));
}
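  /**
   * Checks whether the scanner (and thus the store file's Bloom filter)
   * considers the given row and qualifier possibly present, by building a
   * single-row Scan on that column and asking shouldUseScanner(). The Store
   * and its column descriptor are mocked just enough to supply the column
   * family name.
   */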
private boolean isInBloom(StoreFileScanner scanner, byte[] row,
byte[] qualifier) {
Scan scan = new Scan(row, row);
scan.addColumn(Bytes.toBytes(RandomKeyValueUtil.COLUMN_FAMILY_NAME), qualifier);
Store store = mock(Store.class);
HColumnDescriptor hcd = mock(HColumnDescriptor.class);
when(hcd.getName()).thenReturn(Bytes.toBytes(RandomKeyValueUtil.COLUMN_FAMILY_NAME));
when(store.getFamily()).thenReturn(hcd);
return scanner.shouldUseScanner(scan, store, Long.MIN_VALUE);
}
private Path writeStoreFile(int t, BloomType bt, List<KeyValue> kvs)
throws IOException {
conf.setInt(BloomFilterFactory.IO_STOREFILE_BLOOM_BLOCK_SIZE,
BLOOM_BLOCK_SIZES[t]);
conf.setBoolean(CacheConfig.CACHE_BLOCKS_ON_WRITE_KEY, true);
cacheConf = new CacheConfig(conf);
HFileContext meta = new HFileContextBuilder().withBlockSize(BLOCK_SIZES[t]).build();
StoreFileWriter w = new StoreFileWriter.Builder(conf, cacheConf, fs)
.withOutputDir(TEST_UTIL.getDataTestDir())
.withBloomType(bt)
.withFileContext(meta)
.build();
assertTrue(w.hasGeneralBloom());
assertTrue(w.getGeneralBloomWriter() instanceof CompoundBloomFilterWriter);
CompoundBloomFilterWriter cbbf =
(CompoundBloomFilterWriter) w.getGeneralBloomWriter();
int keyCount = 0;
KeyValue prev = null;
LOG.debug("Total keys/values to insert: " + kvs.size());
for (KeyValue kv : kvs) {
w.append(kv);
// Validate the key count in the Bloom filter.
boolean newKey = true;
if (prev != null) {
newKey = !(bt == BloomType.ROW ? CellUtil.matchingRows(kv,
prev) : CellUtil.matchingRowColumn(kv, prev));
}
if (newKey)
++keyCount;
assertEquals(keyCount, cbbf.getKeyCount());
prev = kv;
}
w.close();
return w.getPath();
}
@Test
public void testCompoundBloomSizing() {
int bloomBlockByteSize = 4096;
int bloomBlockBitSize = bloomBlockByteSize * 8;
double targetErrorRate = 0.01;
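    // idealMaxKeys() gives the largest number of keys a Bloom chunk of
    // bloomBlockBitSize bits can hold at the target error rate;
    // computeBitSize() inverts that, so converting back should land just
    // under the original chunk size (maxKeysPerChunk is rounded down), which
    // the ratio assertion below verifies.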
long maxKeysPerChunk = BloomFilterUtil.idealMaxKeys(bloomBlockBitSize,
targetErrorRate);
    long bloomSize1 = bloomBlockBitSize;
long bloomSize2 = BloomFilterUtil.computeBitSize(maxKeysPerChunk,
targetErrorRate);
double bloomSizeRatio = (bloomSize2 * 1.0 / bloomSize1);
assertTrue(Math.abs(bloomSizeRatio - 0.9999) < 0.0001);
}
@Test
public void testCreateKey() {
byte[] row = "myRow".getBytes();
byte[] qualifier = "myQualifier".getBytes();
    // Mimic what StoreFile.createBloomKeyValue() does
byte[] rowKey = KeyValueUtil.createFirstOnRow(row, 0, row.length, new byte[0], 0, 0, row, 0, 0).getKey();
byte[] rowColKey = KeyValueUtil.createFirstOnRow(row, 0, row.length,
new byte[0], 0, 0, qualifier, 0, qualifier.length).getKey();
KeyValue rowKV = KeyValueUtil.createKeyValueFromKey(rowKey);
KeyValue rowColKV = KeyValueUtil.createKeyValueFromKey(rowColKey);
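    // Both "first on row" keys should agree on row and timestamp; the ROW
    // Bloom key differs only in that its qualifier is empty.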
assertEquals(rowKV.getTimestamp(), rowColKV.getTimestamp());
assertEquals(Bytes.toStringBinary(rowKV.getRowArray(), rowKV.getRowOffset(),
rowKV.getRowLength()), Bytes.toStringBinary(rowColKV.getRowArray(), rowColKV.getRowOffset(),
rowColKV.getRowLength()));
assertEquals(0, rowKV.getQualifierLength());
}
}