common/src/java/org/apache/hadoop/util/bloom/CountingBloomFilter.java - hadoop - Git at Google

 /**
  *
  * Copyright (c) 2005, European Commission project OneLab under contract 034819 (http://www.one-lab.org)
  * All rights reserved.
  * Redistribution and use in source and binary forms, with or
  * without modification, are permitted provided that the following
  * conditions are met:
  *  - Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  *  - Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in
  *    the documentation and/or other materials provided with the distribution.
  *  - Neither the name of the University Catholique de Louvain - UCL
  *    nor the names of its contributors may be used to endorse or
  *    promote products derived from this software without specific prior
  *    written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
  * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
  * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  */

 /**
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 package org.apache.hadoop.util.bloom;

 import java.io.DataInput;
 import java.io.DataOutput;
 import java.io.IOException;

 /**
  * Implements a <i>counting Bloom filter</i>, as defined by Fan et al. in a ToN
  * 2000 paper.
  * <p>
  * A counting Bloom filter is an improvement to standard a Bloom filter as it
  * allows dynamic additions and deletions of set membership information.  This
  * is achieved through the use of a counting vector instead of a bit vector.
  * <p>
  * Originally created by
  * <a href="http://www.one-lab.org">European Commission One-Lab Project 034819</a>.
  *
  * @see Filter The general behavior of a filter
  *
  * @see <a href="http://portal.acm.org/citation.cfm?id=343571.343572">Summary cache: a scalable wide-area web cache sharing protocol</a>
  */
 public final class CountingBloomFilter extends Filter {
   /** Storage for the counting buckets */
   private long[] buckets;

   /** We are using 4bit buckets, so each bucket can count to 15 */
   private final static long BUCKET_MAX_VALUE = 15;

   /** Default constructor - use with readFields */
   public CountingBloomFilter() {}

   /**
    * Constructor
    * @param vectorSize The vector size of <i>this</i> filter.
    * @param nbHash The number of hash function to consider.
    * @param hashType type of the hashing function (see
    * {@link org.apache.hadoop.util.hash.Hash}).
    */
   public CountingBloomFilter(int vectorSize, int nbHash, int hashType) {
     super(vectorSize, nbHash, hashType);
     buckets = new long[buckets2words(vectorSize)];
   }

   /** returns the number of 64 bit words it would take to hold vectorSize buckets */
   private static int buckets2words(int vectorSize) {
    return ((vectorSize - 1) >>> 4) + 1;
   }


   @Override
   public void add(Key key) {
     if(key == null) {
       throw new NullPointerException("key can not be null");
     }

     int[] h = hash.hash(key);
     hash.clear();

     for(int i = 0; i < nbHash; i++) {
       // find the bucket
       int wordNum = h[i] >> 4;          // div 16
       int bucketShift = (h[i] & 0x0f) << 2;  // (mod 16) * 4

       long bucketMask = 15L << bucketShift;
       long bucketValue = (buckets[wordNum] & bucketMask) >>> bucketShift;

       // only increment if the count in the bucket is less than BUCKET_MAX_VALUE
       if(bucketValue < BUCKET_MAX_VALUE) {
         // increment by 1
         buckets[wordNum] = (buckets[wordNum] & ~bucketMask) | ((bucketValue + 1) << bucketShift);
       }
     }
   }

   /**
    * Removes a specified key from <i>this</i> counting Bloom filter.
    * <p>
    * <b>Invariant</b>: nothing happens if the specified key does not belong to <i>this</i> counter Bloom filter.
    * @param key The key to remove.
    */
   public void delete(Key key) {
     if(key == null) {
       throw new NullPointerException("Key may not be null");
     }
     if(!membershipTest(key)) {
       throw new IllegalArgumentException("Key is not a member");
     }

     int[] h = hash.hash(key);
     hash.clear();

     for(int i = 0; i < nbHash; i++) {
       // find the bucket
       int wordNum = h[i] >> 4;          // div 16
       int bucketShift = (h[i] & 0x0f) << 2;  // (mod 16) * 4

       long bucketMask = 15L << bucketShift;
       long bucketValue = (buckets[wordNum] & bucketMask) >>> bucketShift;

       // only decrement if the count in the bucket is between 0 and BUCKET_MAX_VALUE
       if(bucketValue >= 1 && bucketValue < BUCKET_MAX_VALUE) {
         // decrement by 1
         buckets[wordNum] = (buckets[wordNum] & ~bucketMask) | ((bucketValue - 1) << bucketShift);
       }
     }
   }

   @Override
   public void and(Filter filter) {
     if(filter == null
         || !(filter instanceof CountingBloomFilter)
         || filter.vectorSize != this.vectorSize
         || filter.nbHash != this.nbHash) {
       throw new IllegalArgumentException("filters cannot be and-ed");
     }
     CountingBloomFilter cbf = (CountingBloomFilter)filter;

     int sizeInWords = buckets2words(vectorSize);
     for(int i = 0; i < sizeInWords; i++) {
       this.buckets[i] &= cbf.buckets[i];
     }
   }

   @Override
   public boolean membershipTest(Key key) {
     if(key == null) {
       throw new NullPointerException("Key may not be null");
     }

     int[] h = hash.hash(key);
     hash.clear();

     for(int i = 0; i < nbHash; i++) {
       // find the bucket
       int wordNum = h[i] >> 4;          // div 16
       int bucketShift = (h[i] & 0x0f) << 2;  // (mod 16) * 4

       long bucketMask = 15L << bucketShift;

       if((buckets[wordNum] & bucketMask) == 0) {
         return false;
       }
     }

     return true;
   }

   /**
    * This method calculates an approximate count of the key, i.e. how many
    * times the key was added to the filter. This allows the filter to be
    * used as an approximate <code>key -&gt; count</code> map.
    * <p>NOTE: due to the bucket size of this filter, inserting the same
    * key more than 15 times will cause an overflow at all filter positions
    * associated with this key, and it will significantly increase the error
    * rate for this and other keys. For this reason the filter can only be
    * used to store small count values <code>0 &lt;= N &lt;&lt; 15</code>.
    * @param key key to be tested
    * @return 0 if the key is not present. Otherwise, a positive value v will
    * be returned such that <code>v == count</code> with probability equal to the
    * error rate of this filter, and <code>v &gt; count</code> otherwise.
    * Additionally, if the filter experienced an underflow as a result of
    * {@link #delete(Key)} operation, the return value may be lower than the
    * <code>count</code> with the probability of the false negative rate of such
    * filter.
    */
   public int approximateCount(Key key) {
     int res = Integer.MAX_VALUE;
     int[] h = hash.hash(key);
     hash.clear();
     for (int i = 0; i < nbHash; i++) {
       // find the bucket
       int wordNum = h[i] >> 4;          // div 16
       int bucketShift = (h[i] & 0x0f) << 2;  // (mod 16) * 4

       long bucketMask = 15L << bucketShift;
       long bucketValue = (buckets[wordNum] & bucketMask) >>> bucketShift;
       if (bucketValue < res) res = (int)bucketValue;
     }
     if (res != Integer.MAX_VALUE) {
       return res;
     } else {
       return 0;
     }
   }

   @Override
   public void not() {
     throw new UnsupportedOperationException("not() is undefined for "
         + this.getClass().getName());
   }

   @Override
   public void or(Filter filter) {
     if(filter == null
         || !(filter instanceof CountingBloomFilter)
         || filter.vectorSize != this.vectorSize
         || filter.nbHash != this.nbHash) {
       throw new IllegalArgumentException("filters cannot be or-ed");
     }

     CountingBloomFilter cbf = (CountingBloomFilter)filter;

     int sizeInWords = buckets2words(vectorSize);
     for(int i = 0; i < sizeInWords; i++) {
       this.buckets[i] |= cbf.buckets[i];
     }
   }

   @Override
   public void xor(Filter filter) {
     throw new UnsupportedOperationException("xor() is undefined for "
         + this.getClass().getName());
   }

   @Override
   public String toString() {
     StringBuilder res = new StringBuilder();

     for(int i = 0; i < vectorSize; i++) {
       if(i > 0) {
         res.append(" ");
       }

       int wordNum = i >> 4;          // div 16
       int bucketShift = (i & 0x0f) << 2;  // (mod 16) * 4

       long bucketMask = 15L << bucketShift;
       long bucketValue = (buckets[wordNum] & bucketMask) >>> bucketShift;

       res.append(bucketValue);
     }

     return res.toString();
   }

   // Writable

   @Override
   public void write(DataOutput out) throws IOException {
     super.write(out);
     int sizeInWords = buckets2words(vectorSize);
     for(int i = 0; i < sizeInWords; i++) {
       out.writeLong(buckets[i]);
     }
   }

   @Override
   public void readFields(DataInput in) throws IOException {
     super.readFields(in);
     int sizeInWords = buckets2words(vectorSize);
     buckets = new long[sizeInWords];
     for(int i = 0; i < sizeInWords; i++) {
       buckets[i] = in.readLong();
     }
   }
 }
	/**
	*
	* Copyright (c) 2005, European Commission project OneLab under contract 034819 (http://www.one-lab.org)
	* All rights reserved.
	* Redistribution and use in source and binary forms, with or
	* without modification, are permitted provided that the following
	* conditions are met:
	* - Redistributions of source code must retain the above copyright
	* notice, this list of conditions and the following disclaimer.
	* - Redistributions in binary form must reproduce the above copyright
	* notice, this list of conditions and the following disclaimer in
	* the documentation and/or other materials provided with the distribution.
	* - Neither the name of the University Catholique de Louvain - UCL
	* nor the names of its contributors may be used to endorse or
	* promote products derived from this software without specific prior
	* written permission.
	*
	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
	* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
	* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
	* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
	* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
	* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
	* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
	* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
	* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
	* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
	* POSSIBILITY OF SUCH DAMAGE.
	*/

	/**
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	package org.apache.hadoop.util.bloom;

	import java.io.DataInput;
	import java.io.DataOutput;
	import java.io.IOException;

	/**
	* Implements a <i>counting Bloom filter</i>, as defined by Fan et al. in a ToN
	* 2000 paper.
	* <p>
	* A counting Bloom filter is an improvement to standard a Bloom filter as it
	* allows dynamic additions and deletions of set membership information. This
	* is achieved through the use of a counting vector instead of a bit vector.
	* <p>
	* Originally created by
	* <a href="http://www.one-lab.org">European Commission One-Lab Project 034819</a>.
	*
	* @see Filter The general behavior of a filter
	*
	* @see <a href="http://portal.acm.org/citation.cfm?id=343571.343572">Summary cache: a scalable wide-area web cache sharing protocol</a>
	*/
	public final class CountingBloomFilter extends Filter {
	/** Storage for the counting buckets */
	private long[] buckets;

	/** We are using 4bit buckets, so each bucket can count to 15 */
	private final static long BUCKET_MAX_VALUE = 15;

	/** Default constructor - use with readFields */
	public CountingBloomFilter() {}

	/**
	* Constructor
	* @param vectorSize The vector size of <i>this</i> filter.
	* @param nbHash The number of hash function to consider.
	* @param hashType type of the hashing function (see
	* {@link org.apache.hadoop.util.hash.Hash}).
	*/
	public CountingBloomFilter(int vectorSize, int nbHash, int hashType) {
	super(vectorSize, nbHash, hashType);
	buckets = new long[buckets2words(vectorSize)];
	}

	/** returns the number of 64 bit words it would take to hold vectorSize buckets */
	private static int buckets2words(int vectorSize) {
	return ((vectorSize - 1) >>> 4) + 1;
	}


	@Override
	public void add(Key key) {
	if(key == null) {
	throw new NullPointerException("key can not be null");
	}

	int[] h = hash.hash(key);
	hash.clear();

	for(int i = 0; i < nbHash; i++) {
	// find the bucket
	int wordNum = h[i] >> 4; // div 16
	int bucketShift = (h[i] & 0x0f) << 2; // (mod 16) * 4

	long bucketMask = 15L << bucketShift;
	long bucketValue = (buckets[wordNum] & bucketMask) >>> bucketShift;

	// only increment if the count in the bucket is less than BUCKET_MAX_VALUE
	if(bucketValue < BUCKET_MAX_VALUE) {
	// increment by 1
	buckets[wordNum] = (buckets[wordNum] & ~bucketMask) \| ((bucketValue + 1) << bucketShift);
	}
	}
	}

	/**
	* Removes a specified key from <i>this</i> counting Bloom filter.
	* <p>
	* <b>Invariant</b>: nothing happens if the specified key does not belong to <i>this</i> counter Bloom filter.
	* @param key The key to remove.
	*/
	public void delete(Key key) {
	if(key == null) {
	throw new NullPointerException("Key may not be null");
	}
	if(!membershipTest(key)) {
	throw new IllegalArgumentException("Key is not a member");
	}

	int[] h = hash.hash(key);
	hash.clear();

	for(int i = 0; i < nbHash; i++) {
	// find the bucket
	int wordNum = h[i] >> 4; // div 16
	int bucketShift = (h[i] & 0x0f) << 2; // (mod 16) * 4

	long bucketMask = 15L << bucketShift;
	long bucketValue = (buckets[wordNum] & bucketMask) >>> bucketShift;

	// only decrement if the count in the bucket is between 0 and BUCKET_MAX_VALUE
	if(bucketValue >= 1 && bucketValue < BUCKET_MAX_VALUE) {
	// decrement by 1
	buckets[wordNum] = (buckets[wordNum] & ~bucketMask) \| ((bucketValue - 1) << bucketShift);
	}
	}
	}

	@Override
	public void and(Filter filter) {
	if(filter == null
	\|\| !(filter instanceof CountingBloomFilter)
	\|\| filter.vectorSize != this.vectorSize
	\|\| filter.nbHash != this.nbHash) {
	throw new IllegalArgumentException("filters cannot be and-ed");
	}
	CountingBloomFilter cbf = (CountingBloomFilter)filter;

	int sizeInWords = buckets2words(vectorSize);
	for(int i = 0; i < sizeInWords; i++) {
	this.buckets[i] &= cbf.buckets[i];
	}
	}

	@Override
	public boolean membershipTest(Key key) {
	if(key == null) {
	throw new NullPointerException("Key may not be null");
	}

	int[] h = hash.hash(key);
	hash.clear();

	for(int i = 0; i < nbHash; i++) {
	// find the bucket
	int wordNum = h[i] >> 4; // div 16
	int bucketShift = (h[i] & 0x0f) << 2; // (mod 16) * 4

	long bucketMask = 15L << bucketShift;

	if((buckets[wordNum] & bucketMask) == 0) {
	return false;
	}
	}

	return true;
	}

	/**
	* This method calculates an approximate count of the key, i.e. how many
	* times the key was added to the filter. This allows the filter to be
	* used as an approximate <code>key -> count</code> map.
	* <p>NOTE: due to the bucket size of this filter, inserting the same
	* key more than 15 times will cause an overflow at all filter positions
	* associated with this key, and it will significantly increase the error
	* rate for this and other keys. For this reason the filter can only be
	* used to store small count values <code>0 <= N << 15</code>.
	* @param key key to be tested
	* @return 0 if the key is not present. Otherwise, a positive value v will
	* be returned such that <code>v == count</code> with probability equal to the
	* error rate of this filter, and <code>v > count</code> otherwise.
	* Additionally, if the filter experienced an underflow as a result of
	* {@link #delete(Key)} operation, the return value may be lower than the
	* <code>count</code> with the probability of the false negative rate of such
	* filter.
	*/
	public int approximateCount(Key key) {
	int res = Integer.MAX_VALUE;
	int[] h = hash.hash(key);
	hash.clear();
	for (int i = 0; i < nbHash; i++) {
	// find the bucket
	int wordNum = h[i] >> 4; // div 16
	int bucketShift = (h[i] & 0x0f) << 2; // (mod 16) * 4

	long bucketMask = 15L << bucketShift;
	long bucketValue = (buckets[wordNum] & bucketMask) >>> bucketShift;
	if (bucketValue < res) res = (int)bucketValue;
	}
	if (res != Integer.MAX_VALUE) {
	return res;
	} else {
	return 0;
	}
	}

	@Override
	public void not() {
	throw new UnsupportedOperationException("not() is undefined for "
	+ this.getClass().getName());
	}

	@Override
	public void or(Filter filter) {
	if(filter == null
	\|\| !(filter instanceof CountingBloomFilter)
	\|\| filter.vectorSize != this.vectorSize
	\|\| filter.nbHash != this.nbHash) {
	throw new IllegalArgumentException("filters cannot be or-ed");
	}

	CountingBloomFilter cbf = (CountingBloomFilter)filter;

	int sizeInWords = buckets2words(vectorSize);
	for(int i = 0; i < sizeInWords; i++) {
	this.buckets[i] \|= cbf.buckets[i];
	}
	}

	@Override
	public void xor(Filter filter) {
	throw new UnsupportedOperationException("xor() is undefined for "
	+ this.getClass().getName());
	}

	@Override
	public String toString() {
	StringBuilder res = new StringBuilder();

	for(int i = 0; i < vectorSize; i++) {
	if(i > 0) {
	res.append(" ");
	}

	int wordNum = i >> 4; // div 16
	int bucketShift = (i & 0x0f) << 2; // (mod 16) * 4

	long bucketMask = 15L << bucketShift;
	long bucketValue = (buckets[wordNum] & bucketMask) >>> bucketShift;

	res.append(bucketValue);
	}

	return res.toString();
	}

	// Writable

	@Override
	public void write(DataOutput out) throws IOException {
	super.write(out);
	int sizeInWords = buckets2words(vectorSize);
	for(int i = 0; i < sizeInWords; i++) {
	out.writeLong(buckets[i]);
	}
	}

	@Override
	public void readFields(DataInput in) throws IOException {
	super.readFields(in);
	int sizeInWords = buckets2words(vectorSize);
	buckets = new long[sizeInWords];
	for(int i = 0; i < sizeInWords; i++) {
	buckets[i] = in.readLong();
	}
	}
	}