hyracks-fullstack/hyracks/hyracks-storage-am-bloomfilter/src/main/java/org/apache/hyracks/storage/am/bloomfilter/impls/BloomCalculations.java - asterixdb - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *   http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
  */

 package org.apache.hyracks.storage.am.bloomfilter.impls;

 /**
  * This class has been taken from cassandra source code with minor modifications.
  */

 /**
  * The following calculations are taken from:
  * http://www.cs.wisc.edu/~cao/papers/summary-cache/node8.html
  * "Bloom Filters - the math"
  * This class's static methods are meant to facilitate the use of the Bloom
  * Filter class by helping to choose correct values of 'bits per element' and
  * 'number of hash functions, k'.
  */
 public class BloomCalculations {

     private static final int minBuckets = 2;
     private static final int minK = 1;

     /**
      * In the following table, the row 'i' shows false positive rates if i buckets
      * per element are used. Column 'j' shows false positive rates if j hash
      * functions are used. The first row is 'i=0', the first column is 'j=0'.
      * Each cell (i,j) the false positive rate determined by using i buckets per
      * element and j hash functions.
      */
     static final double[][] probs = new double[][] {
             { 1.0 }, // dummy row representing 0 buckets per element
             { 1.0, 1.0 }, // dummy row representing 1 buckets per element
             { 1.0, 0.393, 0.400 },
             { 1.0, 0.283, 0.237, 0.253 },
             { 1.0, 0.221, 0.155, 0.147, 0.160 },
             { 1.0, 0.181, 0.109, 0.092, 0.092, 0.101 }, // 5
             { 1.0, 0.154, 0.0804, 0.0609, 0.0561, 0.0578, 0.0638 },
             { 1.0, 0.133, 0.0618, 0.0423, 0.0359, 0.0347, 0.0364 },
             { 1.0, 0.118, 0.0489, 0.0306, 0.024, 0.0217, 0.0216, 0.0229 },
             { 1.0, 0.105, 0.0397, 0.0228, 0.0166, 0.0141, 0.0133, 0.0135, 0.0145 },
             { 1.0, 0.0952, 0.0329, 0.0174, 0.0118, 0.00943, 0.00844, 0.00819, 0.00846 }, // 10
             { 1.0, 0.0869, 0.0276, 0.0136, 0.00864, 0.0065, 0.00552, 0.00513, 0.00509 },
             { 1.0, 0.08, 0.0236, 0.0108, 0.00646, 0.00459, 0.00371, 0.00329, 0.00314 },
             { 1.0, 0.074, 0.0203, 0.00875, 0.00492, 0.00332, 0.00255, 0.00217, 0.00199, 0.00194 },
             { 1.0, 0.0689, 0.0177, 0.00718, 0.00381, 0.00244, 0.00179, 0.00146, 0.00129, 0.00121, 0.0012 },
             { 1.0, 0.0645, 0.0156, 0.00596, 0.003, 0.00183, 0.00128, 0.001, 0.000852, 0.000775, 0.000744 }, // 15
             { 1.0, 0.0606, 0.0138, 0.005, 0.00239, 0.00139, 0.000935, 0.000702, 0.000574, 0.000505, 0.00047, 0.000459 },
             { 1.0, 0.0571, 0.0123, 0.00423, 0.00193, 0.00107, 0.000692, 0.000499, 0.000394, 0.000335, 0.000302,
                     0.000287, 0.000284 },
             { 1.0, 0.054, 0.0111, 0.00362, 0.00158, 0.000839, 0.000519, 0.00036, 0.000275, 0.000226, 0.000198,
                     0.000183, 0.000176 },
             { 1.0, 0.0513, 0.00998, 0.00312, 0.0013, 0.000663, 0.000394, 0.000264, 0.000194, 0.000155, 0.000132,
                     0.000118, 0.000111, 0.000109 },
             { 1.0, 0.0488, 0.00906, 0.0027, 0.00108, 0.00053, 0.000303, 0.000196, 0.00014, 0.000108, 8.89e-05,
                     7.77e-05, 7.12e-05, 6.79e-05, 6.71e-05 } // 20
     }; // the first column is a dummy column representing K=0.

     /**
      * The optimal number of hashes for a given number of bits per element.
      * These values are automatically calculated from the data above.
      */
     private static final int[] optKPerBuckets = new int[probs.length];

     static {
         for (int i = 0; i < probs.length; i++) {
             double min = Double.MAX_VALUE;
             double[] prob = probs[i];
             for (int j = 0; j < prob.length; j++) {
                 if (prob[j] < min) {
                     min = prob[j];
                     optKPerBuckets[i] = Math.max(minK, j);
                 }
             }
         }
     }

     /**
      * Given the number of buckets that can be used per element, return a
      * specification that minimizes the false positive rate.
      *
      * @param bucketsPerElement
      *            The number of buckets per element for the filter.
      * @return A spec that minimizes the false positive rate.
      */
     public static BloomFilterSpecification computeBloomSpec(int bucketsPerElement) {
         assert bucketsPerElement >= 1;
         assert bucketsPerElement <= probs.length - 1;
         return new BloomFilterSpecification(optKPerBuckets[bucketsPerElement], bucketsPerElement);
     }

     /**
      * Given a maximum tolerable false positive probability, compute a Bloom
      * specification which will give less than the specified false positive rate,
      * but minimize the number of buckets per element and the number of hash
      * functions used. Because bandwidth (and therefore total bitvector size)
      * is considered more expensive than computing power, preference is given
      * to minimizing buckets per element rather than number of hash functions.
      *
      * @param maxBucketsPerElement
      *            The maximum number of buckets available for the filter.
      * @param maxFalsePosProb
      *            The maximum tolerable false positive rate.
      * @return A Bloom Specification which would result in a false positive rate
      *         less than specified by the function call
      * @throws UnsupportedOperationException
      *             if a filter satisfying the parameters cannot be met
      */
     public static BloomFilterSpecification computeBloomSpec(int maxBucketsPerElement, double maxFalsePosProb) {
         assert maxBucketsPerElement >= 1;
         assert maxBucketsPerElement <= probs.length - 1;
         int maxK = probs[maxBucketsPerElement].length - 1;

         // Handle the trivial cases
         if (maxFalsePosProb >= probs[minBuckets][minK]) {
             return new BloomFilterSpecification(2, optKPerBuckets[2]);
         }
         if (maxFalsePosProb < probs[maxBucketsPerElement][maxK]) {
             throw new UnsupportedOperationException(String.format("Unable to satisfy %s with %s buckets per element",
                     maxFalsePosProb, maxBucketsPerElement));
         }

         // First find the minimal required number of buckets:
         int bucketsPerElement = 2;
         int K = optKPerBuckets[2];
         while (probs[bucketsPerElement][K] > maxFalsePosProb) {
             bucketsPerElement++;
             K = optKPerBuckets[bucketsPerElement];
         }
         // Now that the number of buckets is sufficient, see if we can relax K
         // without losing too much precision.
         while (probs[bucketsPerElement][K - 1] <= maxFalsePosProb) {
             K--;
         }

         return new BloomFilterSpecification(K, bucketsPerElement);
     }

     /**
      * Calculates the maximum number of buckets per element that this implementation
      * can support. Crucially, it will lower the bucket count if necessary to meet
      * BitSet's size restrictions.
      */
     public static int maxBucketsPerElement(long numElements) {
         numElements = Math.max(1, numElements);
         double v = Long.MAX_VALUE / (double) numElements;
         if (v < 1.0) {
             throw new UnsupportedOperationException("Cannot compute probabilities for " + numElements + " elements.");
         }
         return Math.min(BloomCalculations.probs.length - 1, (int) v);
     }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing,
	* software distributed under the License is distributed on an
	* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	* KIND, either express or implied. See the License for the
	* specific language governing permissions and limitations
	* under the License.
	*/

	package org.apache.hyracks.storage.am.bloomfilter.impls;

	/**
	* This class has been taken from cassandra source code with minor modifications.
	*/

	/**
	* The following calculations are taken from:
	* http://www.cs.wisc.edu/~cao/papers/summary-cache/node8.html
	* "Bloom Filters - the math"
	* This class's static methods are meant to facilitate the use of the Bloom
	* Filter class by helping to choose correct values of 'bits per element' and
	* 'number of hash functions, k'.
	*/
	public class BloomCalculations {

	private static final int minBuckets = 2;
	private static final int minK = 1;

	/**
	* In the following table, the row 'i' shows false positive rates if i buckets
	* per element are used. Column 'j' shows false positive rates if j hash
	* functions are used. The first row is 'i=0', the first column is 'j=0'.
	* Each cell (i,j) the false positive rate determined by using i buckets per
	* element and j hash functions.
	*/
	static final double[][] probs = new double[][] {
	{ 1.0 }, // dummy row representing 0 buckets per element
	{ 1.0, 1.0 }, // dummy row representing 1 buckets per element
	{ 1.0, 0.393, 0.400 },
	{ 1.0, 0.283, 0.237, 0.253 },
	{ 1.0, 0.221, 0.155, 0.147, 0.160 },
	{ 1.0, 0.181, 0.109, 0.092, 0.092, 0.101 }, // 5
	{ 1.0, 0.154, 0.0804, 0.0609, 0.0561, 0.0578, 0.0638 },
	{ 1.0, 0.133, 0.0618, 0.0423, 0.0359, 0.0347, 0.0364 },
	{ 1.0, 0.118, 0.0489, 0.0306, 0.024, 0.0217, 0.0216, 0.0229 },
	{ 1.0, 0.105, 0.0397, 0.0228, 0.0166, 0.0141, 0.0133, 0.0135, 0.0145 },
	{ 1.0, 0.0952, 0.0329, 0.0174, 0.0118, 0.00943, 0.00844, 0.00819, 0.00846 }, // 10
	{ 1.0, 0.0869, 0.0276, 0.0136, 0.00864, 0.0065, 0.00552, 0.00513, 0.00509 },
	{ 1.0, 0.08, 0.0236, 0.0108, 0.00646, 0.00459, 0.00371, 0.00329, 0.00314 },
	{ 1.0, 0.074, 0.0203, 0.00875, 0.00492, 0.00332, 0.00255, 0.00217, 0.00199, 0.00194 },
	{ 1.0, 0.0689, 0.0177, 0.00718, 0.00381, 0.00244, 0.00179, 0.00146, 0.00129, 0.00121, 0.0012 },
	{ 1.0, 0.0645, 0.0156, 0.00596, 0.003, 0.00183, 0.00128, 0.001, 0.000852, 0.000775, 0.000744 }, // 15
	{ 1.0, 0.0606, 0.0138, 0.005, 0.00239, 0.00139, 0.000935, 0.000702, 0.000574, 0.000505, 0.00047, 0.000459 },
	{ 1.0, 0.0571, 0.0123, 0.00423, 0.00193, 0.00107, 0.000692, 0.000499, 0.000394, 0.000335, 0.000302,
	0.000287, 0.000284 },
	{ 1.0, 0.054, 0.0111, 0.00362, 0.00158, 0.000839, 0.000519, 0.00036, 0.000275, 0.000226, 0.000198,
	0.000183, 0.000176 },
	{ 1.0, 0.0513, 0.00998, 0.00312, 0.0013, 0.000663, 0.000394, 0.000264, 0.000194, 0.000155, 0.000132,
	0.000118, 0.000111, 0.000109 },
	{ 1.0, 0.0488, 0.00906, 0.0027, 0.00108, 0.00053, 0.000303, 0.000196, 0.00014, 0.000108, 8.89e-05,
	7.77e-05, 7.12e-05, 6.79e-05, 6.71e-05 } // 20
	}; // the first column is a dummy column representing K=0.

	/**
	* The optimal number of hashes for a given number of bits per element.
	* These values are automatically calculated from the data above.
	*/
	private static final int[] optKPerBuckets = new int[probs.length];

	static {
	for (int i = 0; i < probs.length; i++) {
	double min = Double.MAX_VALUE;
	double[] prob = probs[i];
	for (int j = 0; j < prob.length; j++) {
	if (prob[j] < min) {
	min = prob[j];
	optKPerBuckets[i] = Math.max(minK, j);
	}
	}
	}
	}

	/**
	* Given the number of buckets that can be used per element, return a
	* specification that minimizes the false positive rate.
	*
	* @param bucketsPerElement
	* The number of buckets per element for the filter.
	* @return A spec that minimizes the false positive rate.
	*/
	public static BloomFilterSpecification computeBloomSpec(int bucketsPerElement) {
	assert bucketsPerElement >= 1;
	assert bucketsPerElement <= probs.length - 1;
	return new BloomFilterSpecification(optKPerBuckets[bucketsPerElement], bucketsPerElement);
	}

	/**
	* Given a maximum tolerable false positive probability, compute a Bloom
	* specification which will give less than the specified false positive rate,
	* but minimize the number of buckets per element and the number of hash
	* functions used. Because bandwidth (and therefore total bitvector size)
	* is considered more expensive than computing power, preference is given
	* to minimizing buckets per element rather than number of hash functions.
	*
	* @param maxBucketsPerElement
	* The maximum number of buckets available for the filter.
	* @param maxFalsePosProb
	* The maximum tolerable false positive rate.
	* @return A Bloom Specification which would result in a false positive rate
	* less than specified by the function call
	* @throws UnsupportedOperationException
	* if a filter satisfying the parameters cannot be met
	*/
	public static BloomFilterSpecification computeBloomSpec(int maxBucketsPerElement, double maxFalsePosProb) {
	assert maxBucketsPerElement >= 1;
	assert maxBucketsPerElement <= probs.length - 1;
	int maxK = probs[maxBucketsPerElement].length - 1;

	// Handle the trivial cases
	if (maxFalsePosProb >= probs[minBuckets][minK]) {
	return new BloomFilterSpecification(2, optKPerBuckets[2]);
	}
	if (maxFalsePosProb < probs[maxBucketsPerElement][maxK]) {
	throw new UnsupportedOperationException(String.format("Unable to satisfy %s with %s buckets per element",
	maxFalsePosProb, maxBucketsPerElement));
	}

	// First find the minimal required number of buckets:
	int bucketsPerElement = 2;
	int K = optKPerBuckets[2];
	while (probs[bucketsPerElement][K] > maxFalsePosProb) {
	bucketsPerElement++;
	K = optKPerBuckets[bucketsPerElement];
	}
	// Now that the number of buckets is sufficient, see if we can relax K
	// without losing too much precision.
	while (probs[bucketsPerElement][K - 1] <= maxFalsePosProb) {
	K--;
	}

	return new BloomFilterSpecification(K, bucketsPerElement);
	}

	/**
	* Calculates the maximum number of buckets per element that this implementation
	* can support. Crucially, it will lower the bucket count if necessary to meet
	* BitSet's size restrictions.
	*/
	public static int maxBucketsPerElement(long numElements) {
	numElements = Math.max(1, numElements);
	double v = Long.MAX_VALUE / (double) numElements;
	if (v < 1.0) {
	throw new UnsupportedOperationException("Cannot compute probabilities for " + numElements + " elements.");
	}
	return Math.min(BloomCalculations.probs.length - 1, (int) v);
	}
	}