blob: 3ea73b9bd5c7e6408e80843cb65c8e19defab1b6 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.datasketches.filters.bloomfilter;
import static org.apache.datasketches.common.Util.LS;
import java.nio.charset.StandardCharsets;
import org.apache.datasketches.common.Family;
import org.apache.datasketches.common.SketchesArgumentException;
import org.apache.datasketches.common.SketchesStateException;
import org.apache.datasketches.memory.Buffer;
import org.apache.datasketches.memory.Memory;
import org.apache.datasketches.memory.WritableBuffer;
import org.apache.datasketches.memory.WritableMemory;
import org.apache.datasketches.memory.XxHash;
/**
* <p>A Bloom filter is a data structure that can be used for probabilistic
* set membership.</p>
*
* <p>When querying a Bloom filter, there are no false positives. Specifically:
* When querying an item that has already been inserted to the filter, the filter will
* always indicate that the item is present. There is a chance of false positives, where
* querying an item that has never been presented to the filter will indicate that the
* item has already been seen. Consequently, any query should be interpreted as
* "might have seen."</p>
*
* <p>A standard Bloom filter is unlike typical sketches in that it is not sub-linear
* in size and does not resize itself. A Bloom filter will work up to a target number of
* distinct items, beyond which it will saturate and the false positive rate will start to
* increase. The size of a Bloom filter will be linear in the expected number of
* distinct items.</p>
*
* <p>See the BloomFilterBuilder class for methods to create a filter, especially
* one sized correctly for a target number of distinct elements and a target
* false positive probability.</p>
*
* <p>This implementation uses xxHash64 and follows the approach in Kirsch and Mitzenmacher,
* "Less Hashing, Same Performance: Building a Better Bloom Filter," Wiley Interscience, 2008, pp. 187-218.</p>
*/
public final class BloomFilter {
/**
* The maximum size of a bloom filter in bits.
*/
public static final long MAX_SIZE_BITS = (Integer.MAX_VALUE - Family.BLOOMFILTER.getMaxPreLongs()) * (long) Long.SIZE;
private static final int SER_VER = 1;
private static final int EMPTY_FLAG_MASK = 4;
private static final long BIT_ARRAY_OFFSET = 16;
private static final int FLAGS_BYTE = 3;
private final long seed_; // hash seed
private final short numHashes_; // number of hash values
private final BitArray bitArray_; // the actual data bits
private final WritableMemory wmem_; // used only for direct mode BitArray
/**
* Creates a BloomFilter with given number of bits and number of hash functions,
* and a user-specified seed.
*
* @param numBits The size of the BloomFilter, in bits
* @param numHashes The number of hash functions to apply to items
* @param seed The base hash seed
*/
BloomFilter(final long numBits, final int numHashes, final long seed) {
seed_ = seed;
numHashes_ = (short) numHashes;
bitArray_ = new HeapBitArray(numBits);
wmem_ = null;
}
/**
* Creates a BloomFilter with given number of bits and number of hash functions,
* and a user-specified seed in the provided WritableMemory
*
* @param numBits The size of the BloomFilter, in bits
* @param numHashes The number of hash functions to apply to items
* @param seed The base hash seed
* @param wmem A WritableMemory that will be initialized to hold the filter
*/
BloomFilter(final long numBits, final int numHashes, final long seed, final WritableMemory wmem) {
if (wmem.getCapacity() < Family.BLOOMFILTER.getMaxPreLongs()) {
throw new SketchesArgumentException("Provided WritableMemory capacity insufficient to initialize BloomFilter");
}
// we don't resize so initialize wizth non-empty preLongs value
// and no empty flag
final WritableBuffer wbuf = wmem.asWritableBuffer();
wbuf.putByte((byte) Family.BLOOMFILTER.getMaxPreLongs());
wbuf.putByte((byte) SER_VER);
wbuf.putByte((byte) Family.BLOOMFILTER.getID());
wbuf.putByte((byte) 0); // instead of (bitArray_.isEmpty() ? EMPTY_FLAG_MASK : 0);
wbuf.putShort((short) numHashes);
wbuf.putShort((short) 0); // unused
wbuf.putLong(seed);
seed_ = seed;
numHashes_ = (short) numHashes;
bitArray_ = DirectBitArray.initialize(numBits, wmem.writableRegion(BIT_ARRAY_OFFSET, wmem.getCapacity() - BIT_ARRAY_OFFSET));
wmem_ = wmem;
}
// Constructor used with internalHeapifyOrWrap()
BloomFilter(final short numHashes, final long seed, final BitArray bitArray, final WritableMemory wmem) {
seed_ = seed;
numHashes_ = numHashes;
bitArray_ = bitArray;
wmem_ = wmem;
}
/**
* Reads a serialized image of a BloomFilter from the provided Memory
* @param mem Memory containing a previously serialized BloomFilter
* @return a BloomFilter object
*/
public static BloomFilter heapify(final Memory mem) {
// casting to writable, but heapify so only reading
return internalHeapifyOrWrap((WritableMemory) mem, false, false);
}
/**
* Wraps the given Memory into this filter class. The class itself only contains a few metadata items and holds
* a reference to the Memory object, which contains all the data.
* @param mem the given Memory object
* @return the wrapping BloomFilter class.
*/
public static BloomFilter wrap(final Memory mem) {
// casting to writable, but tracking that the object is read-only
return internalHeapifyOrWrap((WritableMemory) mem, true, false);
}
/**
* Wraps the given WritableMemory into this filter class. The class itself only contains a few metadata items and holds
* a reference to the Memory object, which contains all the data.
* @param wmem the given WritableMemory object
* @return the wrapping BloomFilter class.
*/
public static BloomFilter writableWrap(final WritableMemory wmem) {
return internalHeapifyOrWrap(wmem, true, true);
}
private static BloomFilter internalHeapifyOrWrap(final WritableMemory wmem, final boolean isWrap, final boolean isWritable) {
final Buffer buf = wmem.asBuffer();
final int preLongs = buf.getByte();
final int serVer = buf.getByte();
final int familyID = buf.getByte();
final int flags = buf.getByte();
checkArgument(preLongs < Family.BLOOMFILTER.getMinPreLongs() || preLongs > Family.BLOOMFILTER.getMaxPreLongs(),
"Possible corruption: Incorrect number of preamble bytes specified in header");
checkArgument(serVer != SER_VER, "Possible corruption: Unrecognized serialization version: " + serVer);
checkArgument(familyID != Family.BLOOMFILTER.getID(), "Possible corruption: Incorrect FamilyID for bloom filter. Found: " + familyID);
final short numHashes = buf.getShort();
buf.getShort(); // unused
checkArgument(numHashes < 1, "Possible corruption: Need strictly positive number of hash functions. Found: " + numHashes);
final long seed = buf.getLong();
final boolean isEmpty = (flags & EMPTY_FLAG_MASK) != 0;
final BitArray bitArray;
if (isWrap) {
if (isWritable) {
bitArray = BitArray.writableWrap(wmem.writableRegion(BIT_ARRAY_OFFSET, wmem.getCapacity() - BIT_ARRAY_OFFSET), isEmpty);
} else {
bitArray = BitArray.wrap(wmem.region(BIT_ARRAY_OFFSET, wmem.getCapacity() - BIT_ARRAY_OFFSET), isEmpty);
}
return new BloomFilter(numHashes, seed, bitArray, wmem);
} else { // if heapify
bitArray = BitArray.heapify(buf, isEmpty);
return new BloomFilter(numHashes, seed, bitArray, null);
}
}
/**
* Resets the BloomFilter to an empty state
*/
public void reset() {
bitArray_.reset();
}
/**
* Checks if the BloomFilter has processed any items
* @return True if the BloomFilter is empty, otherwise False
*/
public boolean isEmpty() { return bitArray_.isEmpty(); }
/**
* Returns the number of bits in the BloomFilter that are set to 1.
* @return The number of bits in use in this filter
*/
public long getBitsUsed() { return bitArray_.getNumBitsSet(); }
/**
* Returns the total number of bits in the BloomFilter.
* @return The total size of the BloomFilter
*/
public long getCapacity() { return bitArray_.getCapacity(); }
/**
* Returns the configured number of hash functions for this BloomFilter
* @return The number of hash functions to apply to inputs
*/
public short getNumHashes() { return numHashes_; }
/**
* Returns the hash seed for this BloomFilter.
* @return The hash seed for this filter
*/
public long getSeed() { return seed_; }
/**
* Returns whether the filter has a backing Memory object
* @return true if backed by Memory, otherwise false
*/
public boolean hasMemory() { return wmem_ != null; }
/**
* Returns whether the filter is in read-only mode. That is possible
* only if there is a backing Memory in read-only mode.
* @return true if read-only, otherwise false
*/
public boolean isReadOnly() {
return wmem_ != null && bitArray_.isReadOnly();
}
/**
* Returns whether the filter is a direct (off-heap) or on-heap object.
* That is possible only if there is a backing Memory.
* @return true if using direct memory access, otherwise false
*/
public boolean isDirect() {
return wmem_ != null && bitArray_.isDirect();
}
/**
* Returns the percentage of all bits in the BloomFilter set to 1.
* @return the percentage of bits in the filter set to 1
*/
public double getFillPercentage() {
return (double) bitArray_.getNumBitsSet() / bitArray_.getCapacity();
}
// UPDATE METHODS
/**
* Updates the filter with the provided long value.
* @param item an item with which to update the filter
*/
public void update(final long item) {
final long h0 = XxHash.hashLong(item, seed_);
final long h1 = XxHash.hashLong(item, h0);
updateInternal(h0, h1);
}
/**
* Updates the filter with the provided double value. The value is
* canonicalized (NaN and infinities) prior to updating.
* @param item an item with which to update the filter
*/
public void update(final double item) {
// canonicalize all NaN & +/- infinity forms
final long[] data = { Double.doubleToLongBits(item) };
final long h0 = XxHash.hashLongArr(data, 0, 1, seed_);
final long h1 = XxHash.hashLongArr(data, 0, 1, h0);
updateInternal(h0, h1);
}
/**
* Updates the filter with the provided String.
* The string is converted to a byte array using UTF8 encoding.
*
* <p>Note: this will not produce the same output hash values as the {@link #update(char[])}
* method and will generally be a little slower depending on the complexity of the UTF8 encoding.
* </p>
*
* @param item an item with which to update the filter
*/
public void update(final String item) {
if (item == null || item.isEmpty()) { return; }
final byte[] strBytes = item.getBytes(StandardCharsets.UTF_8);
final long h0 = XxHash.hashByteArr(strBytes, 0, strBytes.length, seed_);
final long h1 = XxHash.hashByteArr(strBytes, 0, strBytes.length, h0);
updateInternal(h0, h1);
}
/**
* Updates the filter with the provided byte[].
* @param data an array with which to update the filter
*/
public void update(final byte[] data) {
if (data == null) { return; }
final long h0 = XxHash.hashByteArr(data, 0, data.length, seed_);
final long h1 = XxHash.hashByteArr(data, 0, data.length, h0);
updateInternal(h0, h1);
}
/**
* Updates the filter with the provided char[].
* @param data an array with which to update the filter
*/
public void update(final char[] data) {
if (data == null) { return; }
final long h0 = XxHash.hashCharArr(data, 0, data.length, seed_);
final long h1 = XxHash.hashCharArr(data, 0, data.length, h0);
updateInternal(h0, h1);
}
/**
* Updates the filter with the provided short[].
* @param data an array with which to update the filter
*/
public void update(final short[] data) {
if (data == null) { return; }
final long h0 = XxHash.hashShortArr(data, 0, data.length, seed_);
final long h1 = XxHash.hashShortArr(data, 0, data.length, h0);
updateInternal(h0, h1);
}
/**
* Updates the filter with the provided int[].
* @param data an array with which to update the filter
*/
public void update(final int[] data) {
if (data == null) { return; }
final long h0 = XxHash.hashIntArr(data, 0, data.length, seed_);
final long h1 = XxHash.hashIntArr(data, 0, data.length, h0);
updateInternal(h0, h1);
}
/**
* Updates the filter with the provided long[].
* @param data an array with which to update the filter
*/
public void update(final long[] data) {
if (data == null) { return; }
final long h0 = XxHash.hashLongArr(data, 0, data.length, seed_);
final long h1 = XxHash.hashLongArr(data, 0, data.length, h0);
updateInternal(h0, h1);
}
/**
* Updates the filter with the data in the provided Memory.
* @param mem a Memory object with which to update the filter
*/
public void update(final Memory mem) {
if (mem == null) { return; }
final long h0 = mem.xxHash64(0, mem.getCapacity(), seed_);
final long h1 = mem.xxHash64(0, mem.getCapacity(), h0);
updateInternal(h0, h1);
}
// Internal method to apply updates given pre-computed hashes
private void updateInternal(final long h0, final long h1) {
final long numBits = bitArray_.getCapacity();
for (int i = 1; i <= numHashes_; ++i) {
// right-shift to ensure non-negative value
final long hashIndex = ((h0 + i * h1) >>> 1) % numBits;
bitArray_.setBit(hashIndex);
}
}
// QUERY-AND-UPDATE METHODS
/**
* Updates the filter with the provided long and
* returns the result from quering that value prior to the update.
* @param item an item with which to update the filter
* @return The query result prior to applying the update
*/
public boolean queryAndUpdate(final long item) {
final long h0 = XxHash.hashLong(item, seed_);
final long h1 = XxHash.hashLong(item, h0);
return queryAndUpdateInternal(h0, h1);
}
/**
* Updates the filter with the provided double and
* returns the result from quering that value prior to the update.
* The double is canonicalized (NaN and +/- infinity) in the call.
* @param item an item with which to update the filter
* @return The query result prior to applying the update
*/
public boolean queryAndUpdate(final double item) {
// canonicalize all NaN & +/- infinity forms
final long[] data = { Double.doubleToLongBits(item) };
final long h0 = XxHash.hashLongArr(data, 0, 1, seed_);
final long h1 = XxHash.hashLongArr(data, 0, 1, h0);
return queryAndUpdateInternal(h0, h1);
}
/**
* Updates the filter with the provided String and
* returns the result from quering that value prior to the update.
* The string is converted to a byte array using UTF8 encoding.
*
* <p>Note: this will not produce the same output hash values as the {@link #queryAndUpdate(char[])}
* method and will generally be a little slower depending on the complexity of the UTF8 encoding.
* </p>
*
* @param item an item with which to update the filter
* @return The query result prior to applying the update, or false if item is null
*/
public boolean queryAndUpdate(final String item) {
if (item == null || item.isEmpty()) { return false; }
final byte[] strBytes = item.getBytes(StandardCharsets.UTF_8);
final long h0 = XxHash.hashByteArr(strBytes, 0, strBytes.length, seed_);
final long h1 = XxHash.hashByteArr(strBytes, 0, strBytes.length, h0);
return queryAndUpdateInternal(h0, h1);
}
/**
* Updates the filter with the provided byte[] and
* returns the result from quering that array prior to the update.
* @param data an array with which to update the filter
* @return The query result prior to applying the update, or false if data is null
*/
public boolean queryAndUpdate(final byte[] data) {
final long h0 = XxHash.hashByteArr(data, 0, data.length, seed_);
final long h1 = XxHash.hashByteArr(data, 0, data.length, h0);
return queryAndUpdateInternal(h0, h1);
}
/**
* Updates the filter with the provided char[] and
* returns the result from quering that array prior to the update.
* @param data an array with which to update the filter
* @return The query result prior to applying the update, or false if data is null
*/
public boolean queryAndUpdate(final char[] data) {
if (data == null) { return false; }
final long h0 = XxHash.hashCharArr(data, 0, data.length, seed_);
final long h1 = XxHash.hashCharArr(data, 0, data.length, h0);
return queryAndUpdateInternal(h0, h1);
}
/**
* Updates the filter with the provided short[] and
* returns the result from quering that array prior to the update.
* @param data an array with which to update the filter
* @return The query result prior to applying the update, or false if data is null
*/
public boolean queryAndUpdate(final short[] data) {
if (data == null) { return false; }
final long h0 = XxHash.hashShortArr(data, 0, data.length, seed_);
final long h1 = XxHash.hashShortArr(data, 0, data.length, h0);
return queryAndUpdateInternal(h0, h1);
}
/**
* Updates the filter with the provided int[] and
* returns the result from quering that array prior to the update.
* @param data an array with which to update the filter
* @return The query result prior to applying the update, or false if data is null
*/
public boolean queryAndUpdate(final int[] data) {
if (data == null) { return false; }
final long h0 = XxHash.hashIntArr(data, 0, data.length, seed_);
final long h1 = XxHash.hashIntArr(data, 0, data.length, h0);
return queryAndUpdateInternal(h0, h1);
}
/**
* Updates the filter with the provided long[] and
* returns the result from quering that array prior to the update.
* @param data an array with which to update the filter
* @return The query result prior to applying the update, or false if data is null
*/
public boolean queryAndUpdate(final long[] data) {
if (data == null) { return false; }
final long h0 = XxHash.hashLongArr(data, 0, data.length, seed_);
final long h1 = XxHash.hashLongArr(data, 0, data.length, h0);
return queryAndUpdateInternal(h0, h1);
}
/**
* Updates the filter with the provided Memory and
* returns the result from quering that Memory prior to the update.
* @param mem an array with which to update the filter
* @return The query result prior to applying the update, or false if mem is null
*/
public boolean queryAndUpdate(final Memory mem) {
if (mem == null) { return false; }
final long h0 = mem.xxHash64(0, mem.getCapacity(), seed_);
final long h1 = mem.xxHash64(0, mem.getCapacity(), h0);
return queryAndUpdateInternal(h0, h1);
}
// Internal query-and-update method given pre-computed hashes
private boolean queryAndUpdateInternal(final long h0, final long h1) {
final long numBits = bitArray_.getCapacity();
boolean valueAlreadyExists = true;
for (int i = 1; i <= numHashes_; ++i) {
final long hashIndex = ((h0 + i * h1) >>> 1) % numBits;
// returns old value of bit
valueAlreadyExists &= bitArray_.getAndSetBit(hashIndex);
}
return valueAlreadyExists;
}
// QUERY METHODS
/**
* Queries the filter with the provided long and returns whether the
* value <em>might</em> have been seen previously. The filter's expected
* False Positive Probability determines the chances of a true result being
* a false positive. False negatives are never possible.
* @param item an item with which to query the filter
* @return The result of querying the filter with the given item
*/
public boolean query(final long item) {
final long h0 = XxHash.hashLong(item, seed_);
final long h1 = XxHash.hashLong(item, h0);
return queryInternal(h0, h1);
}
/**
* Queries the filter with the provided double and returns whether the
* value <em>might</em> have been seen previously. The filter's expected
* False Positive Probability determines the chances of a true result being
* a false positive. False negatives are never possible. Double values are
* canonicalized (NaN and +/- infinity) before querying.
* @param item an item with which to query the filter
* @return The result of querying the filter with the given item
*/
public boolean query(final double item) {
// canonicalize all NaN & +/- infinity forms
final long[] data = { Double.doubleToLongBits(item) };
final long h0 = XxHash.hashLongArr(data, 0, 1, seed_);
final long h1 = XxHash.hashLongArr(data, 0, 1, h0);
return queryInternal(h0, h1);
}
/**
* Queries the filter with the provided double and returns whether the
* value <em>might</em> have been seen previously. The filter's expected
* False Positive Probability determines the chances of a true result being
* a false positive. False negatives are never possible.
* The string is converted to a byte array using UTF8 encoding.
*
* <p>Note: this will not produce the same output hash values as the {@link #update(char[])}
* method and will generally be a little slower depending on the complexity of the UTF8 encoding.
* </p>
*
* @param item an item with which to query the filter
* @return The result of querying the filter with the given item, or false if item is null
*/
public boolean query(final String item) {
if (item == null || item.isEmpty()) { return false; }
final byte[] strBytes = item.getBytes(StandardCharsets.UTF_8);
final long h0 = XxHash.hashByteArr(strBytes, 0, strBytes.length, seed_);
final long h1 = XxHash.hashByteArr(strBytes, 0, strBytes.length, h0);
return queryInternal(h0, h1);
}
/**
* Queries the filter with the provided byte[] and returns whether the
* array <em>might</em> have been seen previously. The filter's expected
* False Positive Probability determines the chances of a true result being
* a false positive. False negatives are never possible.
* @param data an array with which to query the filter
* @return The result of querying the filter with the given data, or false if data is null
*/
public boolean query(final byte[] data) {
if (data == null) { return false; }
final long h0 = XxHash.hashByteArr(data, 0, data.length, seed_);
final long h1 = XxHash.hashByteArr(data, 0, data.length, h0);
return queryInternal(h0, h1);
}
/**
* Queries the filter with the provided char[] and returns whether the
* array <em>might</em> have been seen previously. The filter's expected
* False Positive Probability determines the chances of a true result being
* a false positive. False negatives are never possible.
* @param data an array with which to query the filter
* @return The result of querying the filter with the given data, or false if data is null
*/
public boolean query(final char[] data) {
if (data == null) { return false; }
final long h0 = XxHash.hashCharArr(data, 0, data.length, seed_);
final long h1 = XxHash.hashCharArr(data, 0, data.length, h0);
return queryInternal(h0, h1);
}
/**
* Queries the filter with the provided short[] and returns whether the
* array <em>might</em> have been seen previously. The filter's expected
* False Positive Probability determines the chances of a true result being
* a false positive. False negatives are never possible.
* @param data an array with which to query the filter
* @return The result of querying the filter with the given data, or false if data is null
*/
public boolean query(final short[] data) {
if (data == null) { return false; }
final long h0 = XxHash.hashShortArr(data, 0, data.length, seed_);
final long h1 = XxHash.hashShortArr(data, 0, data.length, h0);
return queryInternal(h0, h1);
}
/**
* Queries the filter with the provided int[] and returns whether the
* array <em>might</em> have been seen previously. The filter's expected
* False Positive Probability determines the chances of a true result being
* a false positive. False negatives are never possible.
* @param data an array with which to query the filter
* @return The result of querying the filter with the given data, or false if data is null
*/
public boolean query(final int[] data) {
if (data == null) { return false; }
final long h0 = XxHash.hashIntArr(data, 0, data.length, seed_);
final long h1 = XxHash.hashIntArr(data, 0, data.length, h0);
return queryInternal(h0, h1);
}
/**
* Queries the filter with the provided long[] and returns whether the
* array <em>might</em> have been seen previously. The filter's expected
* False Positive Probability determines the chances of a true result being
* a false positive. False negatives are never possible.
* @param data an array with which to query the filter
* @return The result of querying the filter with the given data, or false if data is null
*/
public boolean query(final long[] data) {
if (data == null) { return false; }
final long h0 = XxHash.hashLongArr(data, 0, data.length, seed_);
final long h1 = XxHash.hashLongArr(data, 0, data.length, h0);
return queryInternal(h0, h1);
}
/**
* Queries the filter with the provided Memory and returns whether the
* data <em>might</em> have been seen previously. The filter's expected
* False Positive Probability determines the chances of a true result being
* a false positive. False negatives are never possible.
* @param mem a Memory array with which to query the filter
* @return The result of querying the filter with the given Memory, or false if data is null
*/
public boolean query(final Memory mem) {
if (mem == null) { return false; }
final long h0 = mem.xxHash64(0, mem.getCapacity(), seed_);
final long h1 = mem.xxHash64(0, mem.getCapacity(), h0);
return queryInternal(h0, h1);
}
// Internal method to query the filter given pre-computed hashes
private boolean queryInternal(final long h0, final long h1) {
final long numBits = bitArray_.getCapacity();
for (int i = 1; i <= numHashes_; ++i) {
final long hashIndex = ((h0 + i * h1) >>> 1) % numBits;
// returns old value of bit
if (!bitArray_.getBit(hashIndex)) {
return false;
}
}
return true;
}
// OTHER OPERATIONS
/**
* Unions two BloomFilters by applying a logical OR. The result will recognized
* any values seen by either filter (as well as false positives).
* @param other A BloomFilter to union with this one
*/
public void union(final BloomFilter other) {
if (other == null) { return; }
if (!isCompatible(other)) {
throw new SketchesArgumentException("Cannot union sketches with different seeds, hash functions, or sizes");
}
bitArray_.union(other.bitArray_);
}
/**
* Intersects two BloomFilters by applying a logical AND. The result will recognize
* only values seen by both filters (as well as false positives).
* @param other A BloomFilter to union with this one
*/
public void intersect(final BloomFilter other) {
if (other == null) { return; }
if (!isCompatible(other)) {
throw new SketchesArgumentException("Cannot union sketches with different seeds, hash functions, or sizes");
}
bitArray_.intersect(other.bitArray_);
}
/**
* Inverts all the bits of the BloomFilter. Approximately inverts the notion of set-membership.
*/
public void invert() {
bitArray_.invert();
}
/**
* Helps identify if two BloomFilters may be unioned or intersected.
* @param other A BloomFilter to check for compatibility with this one
* @return True if the filters are compatible, otherwise false
*/
public boolean isCompatible(final BloomFilter other) {
if (other == null
|| seed_ != other.seed_
|| numHashes_ != other.numHashes_
|| bitArray_.getArrayLength() != other.bitArray_.getArrayLength()) {
return false;
}
return true;
}
/**
* Returns the length of this BloomFilter when serialized, in bytes
* @return The length of this BloomFilter when serialized, in bytes
*/
public long getSerializedSizeBytes() {
long sizeBytes = 2L * Long.BYTES; // basic sketch info + baseSeed
sizeBytes += bitArray_.getSerializedSizeBytes();
return sizeBytes;
}
/**
* Returns the serialized length of a non-empty BloomFilter of the given size, in bytes
* @param numBits The number of bits of to use for size computation
* @return The serialized length of a non-empty BloomFilter of the given size, in bytes
*/
public static long getSerializedSize(final long numBits) {
return (2L * Long.BYTES) + BitArray.getSerializedSizeBytes(numBits);
}
/*
* A Bloom Filter's serialized image always uses 3 longs of preamble when empty,
* otherwise 4 longs:
*
* <pre>
* Long || Start Byte Adr:
* Adr:
* || 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
* 0 || Preamble_Longs | SerVer | FamID | Flags |----Num Hashes---|-----Unused------|
*
* || 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 |
* 1 ||---------------------------------Hash Seed-------------------------------------|
*
* || 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 |
* 2 ||-------BitArray Length (in longs)----------|-----------Unused------------------|
*
* || 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 |
* 3 ||---------------------------------NumBitsSet------------------------------------|
* </pre>
*
* The raw BitArray bits, if non-empty start at byte 24.
*/
/**
* Serializes the current BloomFilter to an array of bytes.
*
* <p>Note: Method throws if the serialized size exceeds <code>Integer.MAX_VALUE</code>.</p>
* @return A serialized image of the current BloomFilter as byte[]
*/
public byte[] toByteArray() {
final long sizeBytes = getSerializedSizeBytes();
if (sizeBytes > Integer.MAX_VALUE) {
throw new SketchesStateException("Cannot serialize a BloomFilter of this size using toByteArray(); use toLongArray() instead.");
}
final byte[] bytes = new byte[(int) sizeBytes];
if (wmem_ == null) {
final WritableBuffer wbuf = WritableMemory.writableWrap(bytes).asWritableBuffer();
final int numPreLongs = isEmpty() ? Family.BLOOMFILTER.getMinPreLongs() : Family.BLOOMFILTER.getMaxPreLongs();
wbuf.putByte((byte) numPreLongs);
wbuf.putByte((byte) SER_VER);
wbuf.putByte((byte) Family.BLOOMFILTER.getID());
wbuf.putByte((byte) (bitArray_.isEmpty() ? EMPTY_FLAG_MASK : 0));
wbuf.putShort(numHashes_);
wbuf.putShort((short) 0); // unused
wbuf.putLong(seed_);
((HeapBitArray) bitArray_).writeToBuffer(wbuf);
} else {
wmem_.getByteArray(0, bytes, 0, (int) sizeBytes);
if (isEmpty()) {
bytes[FLAGS_BYTE] |= EMPTY_FLAG_MASK;
}
}
return bytes;
}
/**
* Serializes the current BloomFilter to an array of longs. Unlike {@link #toByteArray()},
* this method can handle any size filter.
*
* @return A serialized image of the current BloomFilter as long[]
*/
public long[] toLongArray() {
final long sizeBytes = getSerializedSizeBytes();
final long[] longs = new long[(int) (sizeBytes >> 3)];
if (wmem_ == null) {
final WritableBuffer wbuf = WritableMemory.writableWrap(longs).asWritableBuffer();
final int numPreLongs = isEmpty() ? Family.BLOOMFILTER.getMinPreLongs() : Family.BLOOMFILTER.getMaxPreLongs();
wbuf.putByte((byte) numPreLongs);
wbuf.putByte((byte) SER_VER); // to do: add constant
wbuf.putByte((byte) Family.BLOOMFILTER.getID());
wbuf.putByte((byte) (bitArray_.isEmpty() ? EMPTY_FLAG_MASK : 0));
wbuf.putShort(numHashes_);
wbuf.putShort((short) 0); // unused
wbuf.putLong(seed_);
((HeapBitArray) bitArray_).writeToBuffer(wbuf);
} else {
wmem_.getLongArray(0, longs, 0, (int) (sizeBytes >>> 3));
if (isEmpty()) {
longs[0] |= (EMPTY_FLAG_MASK << (FLAGS_BYTE << 3));
}
}
return longs;
}
// Throws an exception with the provided message if the given condition is false
private static void checkArgument(final boolean condition, final String message) {
if (condition) { throw new SketchesArgumentException(message); }
}
@Override
public String toString() {
final StringBuilder sb = new StringBuilder();
sb.append(LS);
final String thisSimpleName = this.getClass().getSimpleName();
sb.append("### ").append(thisSimpleName).append(" SUMMARY: ").append(LS);
sb.append(" numBits : ").append(bitArray_.getCapacity()).append(LS);
sb.append(" numHashes : ").append(numHashes_).append(LS);
sb.append(" seed : ").append(seed_).append(LS);
sb.append(" bitsUsed : ").append(bitArray_.getNumBitsSet()).append(LS);
sb.append(" fill % : ").append(getFillPercentage()).append(LS);
sb.append("### END SKETCH SUMMARY").append(LS);
return sb.toString();
}
}