kernel/hash.hpp - tuscany-sca-cpp - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *   http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
  */

 /* $Rev$ $Date$ */

 #ifndef tuscany_hash_hpp
 #define tuscany_hash_hpp

 /**
  * Fast hash functions.
  */

 #include <apr_hash.h>
 #include <apr_memcache.h>

 namespace tuscany
 {

 /**
  * Apache apr-util CRC32 hash function.
  *
  * See srclib/apr-util/memcache/apr_memcache.c from the Apache HTTPD
  * source tree. Reproducing the comments from apr_memcache.c here:
  *
  * The crc32 functions and data were originally written by Spencer
  * Garrett <srg@quick.com> and were gleaned from the PostgreSQL source
  * tree at contrib/ltree/crc32.[ch] and from FreeBSD at
  * src/usr.bin/cksum/crc32.c.
  */
 const unsigned int crc32hash(const char* data, const size_t len) {
     return (unsigned int)apr_memcache_hash_default(NULL, data, len);
 }

 /**
  * Apache apr tables default hash function.
  *
  * See srclib/apr/tables/apr_hash.c from the Apache HTTPD source tree.
  * Reproducing the comments from apr_hash.c here:
  *
  * This is the popular `times 33' hash algorithm which is used by
  * perl and also appears in Berkeley DB. This is one of the best
  * known hash functions for strings because it is both computed
  * very fast and distributes very well.
  *
  * The originator may be Dan Bernstein but the code in Berkeley DB
  * cites Chris Torek as the source. The best citation I have found
  * is "Chris Torek, Hash function for text in C, Usenet message
  * <27038@mimsy.umd.edu> in comp.lang.c , October, 1990." in Rich
  * Salz's USENIX 1992 paper about INN which can be found at
  * <http://citeseer.nj.nec.com/salz92internetnews.html>.
  *
  * The magic of number 33, i.e. why it works better than many other
  * constants, prime or not, has never been adequately explained by
  * anyone. So I try an explanation: if one experimentally tests all
  * multipliers between 1 and 256 (as I did while writing a low-level
  * data structure library some time ago) one detects that even
  * numbers are not useable at all. The remaining 128 odd numbers
  * (except for the number 1) work more or less all equally well.
  * They all distribute in an acceptable way and this way fill a hash
  * table with an average percent of approx. 86%.
  *
  * If one compares the chi^2 values of the variants (see
  * Bob Jenkins ``Hashing Frequently Asked Questions'' at
  * http://burtleburtle.net/bob/hash/hashfaq.html for a description
  * of chi^2), the number 33 not even has the best value. But the
  * number 33 and a few other equally good numbers like 17, 31, 63,
  * 127 and 129 have nevertheless a great advantage to the remaining
  * numbers in the large set of possible multipliers: their multiply
  * operation can be replaced by a faster operation based on just one
  * shift plus either a single addition or subtraction operation. And
  * because a hash function has to both distribute good _and_ has to
  * be very fast to compute, those few numbers should be preferred.
  *
  *                  -- Ralf S. Engelschall <rse@engelschall.com>
  */
 const unsigned int times33hash(const char* data, const size_t len) {
     apr_ssize_t l = len;
     return apr_hashfunc_default(data, &l);
 }

 /**
  * A very fast, non-cryptographic hash suitable for general hash-based
  * lookup. See http://murmurhash.googlepages.com/ for more details.
  *
  * Original code by Austin Appleby, released to the public domain and under
  * the MIT license.
  *
  * Compiles down to ~52 instructions on x86.
  * Passes chi^2 tests for practically all keysets & bucket sizes.
  * Excellent avalanche behavior. Maximum bias is under 0.5%.
  * Passes Bob Jenkin's frog.c torture-test. No collisions possible for 4 byte
  * keys, no small 1 to 7 bit differentials.
  */
 const unsigned int murmurhash(const char* key, const size_t klen) {
     unsigned int len = (unsigned int)klen;
     const unsigned int seed = 0;

     // 'm' and 'r' are mixing constants generated offline.
     // They're not really 'magic', they just happen to work well.
     const unsigned int m = 0x5bd1e995;
     const int r = 24;

     // Initialize the hash to a 'random' value
     unsigned int h = seed ^ len;

     // Mix 4 bytes at a time into the hash
     const unsigned char* data = (const unsigned char*)key;
     while(len >= 4) {
         unsigned int k = *(unsigned int*)(void*)data;
         k *= m;
         k ^= k >> r;
         k *= m;
         h *= m;
         h ^= k;
         data += 4;
         len -= 4;
     }

     // Handle the last few bytes of the input array
     switch(len) {
     case 3: h ^= data[2] << 16;
     case 2: h ^= data[1] << 8;
     case 1: h ^= data[0];
             h *= m;
     };

     // Do a few final mixes of the hash to ensure the last few
     // bytes are well-incorporated.
     h ^= h >> 13;
     h *= m;
     h ^= h >> 15;

     return h;
 }

 /**
  * An endian and alignment neutral, but half the speed, version of
  * the murmur hash.
  */
 const unsigned int portablemurmurhash(const char* key, const size_t klen) {
     unsigned int len = (unsigned int)klen;
     const unsigned int seed = 0;

     // 'm' and 'r' are mixing constants generated offline.
     // They're not really 'magic', they just happen to work well.
     const unsigned int m = 0x5bd1e995;
     const int r = 24;

     // Initialize the hash to a 'random' value
     unsigned int h = seed ^ len;

     // Mix 4 bytes at a time into the hash
     const unsigned char* data = (const unsigned char *)key;
     while(len >= 4) {
         unsigned int k;
         k  = data[0];
         k |= data[1] << 8;
         k |= data[2] << 16;
         k |= data[3] << 24;
         k *= m;
         k ^= k >> r;
         k *= m;
         h *= m;
         h ^= k;
         data += 4;
         len -= 4;
     }

     // Handle the last few bytes of the input array
     switch(len) {
     case 3: h ^= data[2] << 16;
     case 2: h ^= data[1] << 8;
     case 1: h ^= data[0];
             h *= m;
     };

     // Do a few final mixes of the hash to ensure the last few
     // bytes are well-incorporated.
     h ^= h >> 13;
     h *= m;
     h ^= h >> 15;

     return h;
 }

 const unsigned int hashselect(const unsigned int hash, const unsigned int max) {
     return hash % max;
 }

 }
 #endif /* tuscany_hash_hpp */
	/*
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing,
	* software distributed under the License is distributed on an
	* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	* KIND, either express or implied. See the License for the
	* specific language governing permissions and limitations
	* under the License.
	*/

	/* $Rev$ $Date$ */

	#ifndef tuscany_hash_hpp
	#define tuscany_hash_hpp

	/**
	* Fast hash functions.
	*/

	#include <apr_hash.h>
	#include <apr_memcache.h>

	namespace tuscany
	{

	/**
	* Apache apr-util CRC32 hash function.
	*
	* See srclib/apr-util/memcache/apr_memcache.c from the Apache HTTPD
	* source tree. Reproducing the comments from apr_memcache.c here:
	*
	* The crc32 functions and data were originally written by Spencer
	* Garrett <srg@quick.com> and were gleaned from the PostgreSQL source
	* tree at contrib/ltree/crc32.[ch] and from FreeBSD at
	* src/usr.bin/cksum/crc32.c.
	*/
	const unsigned int crc32hash(const char* data, const size_t len) {
	return (unsigned int)apr_memcache_hash_default(NULL, data, len);
	}

	/**
	* Apache apr tables default hash function.
	*
	* See srclib/apr/tables/apr_hash.c from the Apache HTTPD source tree.
	* Reproducing the comments from apr_hash.c here:
	*
	* This is the popular `times 33' hash algorithm which is used by
	* perl and also appears in Berkeley DB. This is one of the best
	* known hash functions for strings because it is both computed
	* very fast and distributes very well.
	*
	* The originator may be Dan Bernstein but the code in Berkeley DB
	* cites Chris Torek as the source. The best citation I have found
	* is "Chris Torek, Hash function for text in C, Usenet message
	* <27038@mimsy.umd.edu> in comp.lang.c , October, 1990." in Rich
	* Salz's USENIX 1992 paper about INN which can be found at
	* <http://citeseer.nj.nec.com/salz92internetnews.html>.
	*
	* The magic of number 33, i.e. why it works better than many other
	* constants, prime or not, has never been adequately explained by
	* anyone. So I try an explanation: if one experimentally tests all
	* multipliers between 1 and 256 (as I did while writing a low-level
	* data structure library some time ago) one detects that even
	* numbers are not useable at all. The remaining 128 odd numbers
	* (except for the number 1) work more or less all equally well.
	* They all distribute in an acceptable way and this way fill a hash
	* table with an average percent of approx. 86%.
	*
	* If one compares the chi^2 values of the variants (see
	* Bob Jenkins ``Hashing Frequently Asked Questions'' at
	* http://burtleburtle.net/bob/hash/hashfaq.html for a description
	* of chi^2), the number 33 not even has the best value. But the
	* number 33 and a few other equally good numbers like 17, 31, 63,
	* 127 and 129 have nevertheless a great advantage to the remaining
	* numbers in the large set of possible multipliers: their multiply
	* operation can be replaced by a faster operation based on just one
	* shift plus either a single addition or subtraction operation. And
	* because a hash function has to both distribute good _and_ has to
	* be very fast to compute, those few numbers should be preferred.
	*
	* -- Ralf S. Engelschall <rse@engelschall.com>
	*/
	const unsigned int times33hash(const char* data, const size_t len) {
	apr_ssize_t l = len;
	return apr_hashfunc_default(data, &l);
	}

	/**
	* A very fast, non-cryptographic hash suitable for general hash-based
	* lookup. See http://murmurhash.googlepages.com/ for more details.
	*
	* Original code by Austin Appleby, released to the public domain and under
	* the MIT license.
	*
	* Compiles down to ~52 instructions on x86.
	* Passes chi^2 tests for practically all keysets & bucket sizes.
	* Excellent avalanche behavior. Maximum bias is under 0.5%.
	* Passes Bob Jenkin's frog.c torture-test. No collisions possible for 4 byte
	* keys, no small 1 to 7 bit differentials.
	*/
	const unsigned int murmurhash(const char* key, const size_t klen) {
	unsigned int len = (unsigned int)klen;
	const unsigned int seed = 0;

	// 'm' and 'r' are mixing constants generated offline.
	// They're not really 'magic', they just happen to work well.
	const unsigned int m = 0x5bd1e995;
	const int r = 24;

	// Initialize the hash to a 'random' value
	unsigned int h = seed ^ len;

	// Mix 4 bytes at a time into the hash
	const unsigned char* data = (const unsigned char*)key;
	while(len >= 4) {
	unsigned int k = (unsigned int)(void*)data;
	k *= m;
	k ^= k >> r;
	k *= m;
	h *= m;
	h ^= k;
	data += 4;
	len -= 4;
	}

	// Handle the last few bytes of the input array
	switch(len) {
	case 3: h ^= data[2] << 16;
	case 2: h ^= data[1] << 8;
	case 1: h ^= data[0];
	h *= m;
	};

	// Do a few final mixes of the hash to ensure the last few
	// bytes are well-incorporated.
	h ^= h >> 13;
	h *= m;
	h ^= h >> 15;

	return h;
	}

	/**
	* An endian and alignment neutral, but half the speed, version of
	* the murmur hash.
	*/
	const unsigned int portablemurmurhash(const char* key, const size_t klen) {
	unsigned int len = (unsigned int)klen;
	const unsigned int seed = 0;

	// 'm' and 'r' are mixing constants generated offline.
	// They're not really 'magic', they just happen to work well.
	const unsigned int m = 0x5bd1e995;
	const int r = 24;

	// Initialize the hash to a 'random' value
	unsigned int h = seed ^ len;

	// Mix 4 bytes at a time into the hash
	const unsigned char* data = (const unsigned char *)key;
	while(len >= 4) {
	unsigned int k;
	k = data[0];
	k \|= data[1] << 8;
	k \|= data[2] << 16;
	k \|= data[3] << 24;
	k *= m;
	k ^= k >> r;
	k *= m;
	h *= m;
	h ^= k;
	data += 4;
	len -= 4;
	}

	// Handle the last few bytes of the input array
	switch(len) {
	case 3: h ^= data[2] << 16;
	case 2: h ^= data[1] << 8;
	case 1: h ^= data[0];
	h *= m;
	};

	// Do a few final mixes of the hash to ensure the last few
	// bytes are well-incorporated.
	h ^= h >> 13;
	h *= m;
	h ^= h >> 15;

	return h;
	}

	const unsigned int hashselect(const unsigned int hash, const unsigned int max) {
	return hash % max;
	}

	}
	#endif /* tuscany_hash_hpp */