| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.codecs.bloom; |
| |
| import java.io.IOException; |
| |
| import org.apache.lucene.search.DocIdSetIterator; |
| import org.apache.lucene.store.DataInput; |
| import org.apache.lucene.store.DataOutput; |
| import org.apache.lucene.util.Accountable; |
| import org.apache.lucene.util.BytesRef; |
| import org.apache.lucene.util.FixedBitSet; |
| import org.apache.lucene.util.RamUsageEstimator; |
| |
| /** |
| * <p> |
| * A class used to represent a set of many, potentially large, values (e.g. many |
| * long strings such as URLs), using a significantly smaller amount of memory. |
| * </p> |
| * <p> |
| * The set is "lossy" in that it cannot definitively state that is does contain |
| * a value but it <em>can</em> definitively say if a value is <em>not</em> in |
| * the set. It can therefore be used as a Bloom Filter. |
| * </p> |
| * Another application of the set is that it can be used to perform fuzzy counting because |
| * it can estimate reasonably accurately how many unique values are contained in the set. |
| * <p>This class is NOT threadsafe.</p> |
| * <p> |
| * Internally a Bitset is used to record values and once a client has finished recording |
| * a stream of values the {@link #downsize(float)} method can be used to create a suitably smaller set that |
| * is sized appropriately for the number of values recorded and desired saturation levels. |
| * |
| * </p> |
| * @lucene.experimental |
| */ |
| public class FuzzySet implements Accountable { |
| |
| public static final int VERSION_SPI = 1; // HashFunction used to be loaded through a SPI |
| public static final int VERSION_START = VERSION_SPI; |
| public static final int VERSION_CURRENT = 2; |
| |
| public static HashFunction hashFunctionForVersion(int version) { |
| if (version < VERSION_START) { |
| throw new IllegalArgumentException("Version " + version + " is too old, expected at least " + VERSION_START); |
| } else if (version > VERSION_CURRENT) { |
| throw new IllegalArgumentException("Version " + version + " is too new, expected at most " + VERSION_CURRENT); |
| } |
| return MurmurHash2.INSTANCE; |
| } |
| |
| /** |
| * Result from {@link FuzzySet#contains(BytesRef)}: |
| * can never return definitively YES (always MAYBE), |
| * but can sometimes definitely return NO. |
| */ |
| public enum ContainsResult { |
| MAYBE, NO |
| }; |
| private HashFunction hashFunction; |
| private FixedBitSet filter; |
| private int bloomSize; |
| |
| //The sizes of BitSet used are all numbers that, when expressed in binary form, |
| //are all ones. This is to enable fast downsizing from one bitset to another |
| //by simply ANDing each set index in one bitset with the size of the target bitset |
| // - this provides a fast modulo of the number. Values previously accumulated in |
| // a large bitset and then mapped to a smaller set can be looked up using a single |
| // AND operation of the query term's hash rather than needing to perform a 2-step |
| // translation of the query term that mirrors the stored content's reprojections. |
| static final int usableBitSetSizes[]; |
| static |
| { |
| usableBitSetSizes=new int[30]; |
| int mask=1; |
| int size=mask; |
| for (int i = 0; i < usableBitSetSizes.length; i++) { |
| size=(size<<1)|mask; |
| usableBitSetSizes[i]=size; |
| } |
| } |
| |
| /** |
| * Rounds down required maxNumberOfBits to the nearest number that is made up |
| * of all ones as a binary number. |
| * Use this method where controlling memory use is paramount. |
| */ |
| public static int getNearestSetSize(int maxNumberOfBits) |
| { |
| int result=usableBitSetSizes[0]; |
| for (int i = 0; i < usableBitSetSizes.length; i++) { |
| if(usableBitSetSizes[i]<=maxNumberOfBits) |
| { |
| result=usableBitSetSizes[i]; |
| } |
| } |
| return result; |
| } |
| |
| /** |
| * Use this method to choose a set size where accuracy (low content saturation) is more important |
| * than deciding how much memory to throw at the problem. |
| * @param desiredSaturation A number between 0 and 1 expressing the % of bits set once all values have been recorded |
| * @return The size of the set nearest to the required size |
| */ |
| public static int getNearestSetSize(int maxNumberOfValuesExpected, |
| float desiredSaturation) { |
| // Iterate around the various scales of bitset from smallest to largest looking for the first that |
| // satisfies value volumes at the chosen saturation level |
| for (int i = 0; i < usableBitSetSizes.length; i++) { |
| int numSetBitsAtDesiredSaturation = (int) (usableBitSetSizes[i] * desiredSaturation); |
| int estimatedNumUniqueValues = getEstimatedNumberUniqueValuesAllowingForCollisions( |
| usableBitSetSizes[i], numSetBitsAtDesiredSaturation); |
| if (estimatedNumUniqueValues > maxNumberOfValuesExpected) { |
| return usableBitSetSizes[i]; |
| } |
| } |
| return -1; |
| } |
| |
| public static FuzzySet createSetBasedOnMaxMemory(int maxNumBytes) |
| { |
| int setSize=getNearestSetSize(maxNumBytes); |
| return new FuzzySet(new FixedBitSet(setSize+1),setSize, hashFunctionForVersion(VERSION_CURRENT)); |
| } |
| |
| public static FuzzySet createSetBasedOnQuality(int maxNumUniqueValues, float desiredMaxSaturation) |
| { |
| int setSize=getNearestSetSize(maxNumUniqueValues,desiredMaxSaturation); |
| return new FuzzySet(new FixedBitSet(setSize+1),setSize, hashFunctionForVersion(VERSION_CURRENT)); |
| } |
| |
| |
| |
| |
| private FuzzySet(FixedBitSet filter, int bloomSize, HashFunction hashFunction) { |
| super(); |
| this.filter = filter; |
| this.bloomSize = bloomSize; |
| this.hashFunction=hashFunction; |
| } |
| |
| /** |
| * The main method required for a Bloom filter which, given a value determines set membership. |
| * Unlike a conventional set, the fuzzy set returns NO or MAYBE rather than true or false. |
| * @return NO or MAYBE |
| */ |
| public ContainsResult contains(BytesRef value) { |
| int hash = hashFunction.hash(value); |
| if (hash < 0) { |
| hash = hash * -1; |
| } |
| return mayContainValue(hash); |
| } |
| |
| /** |
| * Serializes the data set to file using the following format: |
| * <ul> |
| * <li>FuzzySet -->FuzzySetVersion,HashFunctionName,BloomSize, |
| * NumBitSetWords,BitSetWord<sup>NumBitSetWords</sup></li> |
| * <li>HashFunctionName --> {@link DataOutput#writeString(String) String} The |
| * name of a ServiceProvider registered {@link HashFunction}</li> |
| * <li>FuzzySetVersion --> {@link DataOutput#writeInt Uint32} The version number of the {@link FuzzySet} class</li> |
| * <li>BloomSize --> {@link DataOutput#writeInt Uint32} The modulo value used |
| * to project hashes into the field's Bitset</li> |
| * <li>NumBitSetWords --> {@link DataOutput#writeInt Uint32} The number of |
| * longs (as returned from {@link FixedBitSet#getBits})</li> |
| * <li>BitSetWord --> {@link DataOutput#writeLong Long} A long from the array |
| * returned by {@link FixedBitSet#getBits}</li> |
| * </ul> |
| * @param out Data output stream |
| * @throws IOException If there is a low-level I/O error |
| */ |
| public void serialize(DataOutput out) throws IOException |
| { |
| out.writeInt(VERSION_CURRENT); |
| out.writeInt(bloomSize); |
| long[] bits = filter.getBits(); |
| out.writeInt(bits.length); |
| for (int i = 0; i < bits.length; i++) { |
| // Can't used VLong encoding because cant cope with negative numbers |
| // output by FixedBitSet |
| out.writeLong(bits[i]); |
| } |
| } |
| public static FuzzySet deserialize(DataInput in) throws IOException |
| { |
| int version=in.readInt(); |
| if (version == VERSION_SPI) { |
| in.readString(); |
| } |
| final HashFunction hashFunction = hashFunctionForVersion(version); |
| int bloomSize=in.readInt(); |
| int numLongs=in.readInt(); |
| long[]longs=new long[numLongs]; |
| for (int i = 0; i < numLongs; i++) { |
| longs[i]=in.readLong(); |
| } |
| FixedBitSet bits = new FixedBitSet(longs,bloomSize+1); |
| return new FuzzySet(bits,bloomSize,hashFunction); |
| } |
| |
| private ContainsResult mayContainValue(int positiveHash) { |
| assert positiveHash >= 0; |
| // Bloom sizes are always base 2 and so can be ANDed for a fast modulo |
| int pos = positiveHash & bloomSize; |
| if (filter.get(pos)) { |
| // This term may be recorded in this index (but could be a collision) |
| return ContainsResult.MAYBE; |
| } |
| // definitely NOT in this segment |
| return ContainsResult.NO; |
| } |
| |
| /** |
| * Records a value in the set. The referenced bytes are hashed and then modulo n'd where n is the |
| * chosen size of the internal bitset. |
| * @param value the key value to be hashed |
| * @throws IOException If there is a low-level I/O error |
| */ |
| public void addValue(BytesRef value) throws IOException { |
| int hash = hashFunction.hash(value); |
| if (hash < 0) { |
| hash = hash * -1; |
| } |
| // Bitmasking using bloomSize is effectively a modulo operation. |
| int bloomPos = hash & bloomSize; |
| filter.set(bloomPos); |
| } |
| |
| |
| /** |
| * |
| * @param targetMaxSaturation A number between 0 and 1 describing the % of bits that would ideally be set in the |
| * result. Lower values have better accuracy but require more space. |
| * @return a smaller FuzzySet or null if the current set is already over-saturated |
| */ |
| public FuzzySet downsize(float targetMaxSaturation) |
| { |
| int numBitsSet = filter.cardinality(); |
| FixedBitSet rightSizedBitSet = filter; |
| int rightSizedBitSetSize = bloomSize; |
| //Hopefully find a smaller size bitset into which we can project accumulated values while maintaining desired saturation level |
| for (int i = 0; i < usableBitSetSizes.length; i++) { |
| int candidateBitsetSize = usableBitSetSizes[i]; |
| float candidateSaturation = (float) numBitsSet |
| / (float) candidateBitsetSize; |
| if (candidateSaturation <= targetMaxSaturation) { |
| rightSizedBitSetSize = candidateBitsetSize; |
| break; |
| } |
| } |
| // Re-project the numbers to a smaller space if necessary |
| if (rightSizedBitSetSize < bloomSize) { |
| // Reset the choice of bitset to the smaller version |
| rightSizedBitSet = new FixedBitSet(rightSizedBitSetSize + 1); |
| // Map across the bits from the large set to the smaller one |
| int bitIndex = 0; |
| do { |
| bitIndex = filter.nextSetBit(bitIndex); |
| if (bitIndex != DocIdSetIterator.NO_MORE_DOCS) { |
| // Project the larger number into a smaller one effectively |
| // modulo-ing by using the target bitset size as a mask |
| int downSizedBitIndex = bitIndex & rightSizedBitSetSize; |
| rightSizedBitSet.set(downSizedBitIndex); |
| bitIndex++; |
| } |
| } while ( (bitIndex >= 0)&&(bitIndex<=bloomSize)); |
| } else { |
| return null; |
| } |
| return new FuzzySet(rightSizedBitSet,rightSizedBitSetSize, hashFunction); |
| } |
| |
| public int getEstimatedUniqueValues() |
| { |
| return getEstimatedNumberUniqueValuesAllowingForCollisions(bloomSize, filter.cardinality()); |
| } |
| |
| // Given a set size and a the number of set bits, produces an estimate of the number of unique values recorded |
| public static int getEstimatedNumberUniqueValuesAllowingForCollisions( |
| int setSize, int numRecordedBits) { |
| double setSizeAsDouble = setSize; |
| double numRecordedBitsAsDouble = numRecordedBits; |
| double saturation = numRecordedBitsAsDouble / setSizeAsDouble; |
| double logInverseSaturation = Math.log(1 - saturation) * -1; |
| return (int) (setSizeAsDouble * logInverseSaturation); |
| } |
| |
| public float getSaturation() { |
| int numBitsSet = filter.cardinality(); |
| return (float) numBitsSet / (float) bloomSize; |
| } |
| |
| @Override |
| public long ramBytesUsed() { |
| return RamUsageEstimator.sizeOf(filter.getBits()); |
| } |
| |
| @Override |
| public String toString() { |
| return getClass().getSimpleName() + "(hash=" + hashFunction + ")"; |
| } |
| } |