src/main/java/org/apache/commons/codec/digest/MurmurHash2.java - commons-codec - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 package org.apache.commons.codec.digest;

 import org.apache.commons.codec.binary.StringUtils;

 /**
  * Implementation of the MurmurHash2 32-bit and 64-bit hash functions.
  *
  * <p>MurmurHash is a non-cryptographic hash function suitable for general
  * hash-based lookup. The name comes from two basic operations, multiply (MU)
  * and rotate (R), used in its inner loop. Unlike cryptographic hash functions,
  * it is not specifically designed to be difficult to reverse by an adversary,
  * making it unsuitable for cryptographic purposes.</p>
  *
  * <p>This contains a Java port of the 32-bit hash function {@code MurmurHash2}
  * and the 64-bit hash function {@code MurmurHash64A} from Austin Applyby's
  * original {@code c++} code in SMHasher.</p>
  *
  * <p>This is a re-implementation of the original C code plus some additional
  * features.</p>
  *
  * <p>This is public domain code with no copyrights. From home page of
  * <a href="https://github.com/aappleby/smhasher">SMHasher</a>:</p>
  *
  * <blockquote>
  * "All MurmurHash versions are public domain software, and the author
  * disclaims all copyright to their code."
  * </blockquote>
  *
  * @see <a href="https://en.wikipedia.org/wiki/MurmurHash">MurmurHash</a>
  * @see <a href="https://github.com/aappleby/smhasher/blob/master/src/MurmurHash2.cpp">
  *   Original MurmurHash2 c++ code</a>
  * @since 1.13
  */
 public final class MurmurHash2 {

     // Constants for 32-bit variant
     private static final int M32 = 0x5bd1e995;
     private static final int R32 = 24;

     // Constants for 64-bit variant
     private static final long M64 = 0xc6a4a7935bd1e995L;
     private static final int R64 = 47;

     /** No instance methods. */
     private MurmurHash2() {
     }

     /**
      * Generates a 32-bit hash from byte array with the given length and seed.
      *
      * @param data The input byte array
      * @param length The length of the array
      * @param seed The initial seed value
      * @return The 32-bit hash
      */
     public static int hash32(final byte[] data, final int length, final int seed) {
         // Initialize the hash to a random value
         int h = seed ^ length;

         // Mix 4 bytes at a time into the hash
         final int nblocks = length >> 2;

         // body
         for (int i = 0; i < nblocks; i++) {
             final int index = (i << 2);
             int k = getLittleEndianInt(data, index);
             k *= M32;
             k ^= k >>> R32;
             k *= M32;
             h *= M32;
             h ^= k;
         }

         // Handle the last few bytes of the input array
         final int index = (nblocks << 2);
         switch (length - index) {
         case 3:
             h ^= (data[index + 2] & 0xff) << 16;
         case 2:
             h ^= (data[index + 1] & 0xff) << 8;
         case 1:
             h ^= (data[index] & 0xff);
             h *= M32;
         }

         // Do a few final mixes of the hash to ensure the last few
         // bytes are well-incorporated.
         h ^= h >>> 13;
         h *= M32;
         h ^= h >>> 15;

         return h;
     }

     /**
      * Generates a 32-bit hash from byte array with the given length and a default seed value.
      * This is a helper method that will produce the same result as:
      *
      * <pre>
      * int seed = 0x9747b28c;
      * int hash = MurmurHash2.hash32(data, length, seed);
      * </pre>
      *
      * @param data The input byte array
      * @param length The length of the array
      * @return The 32-bit hash
      * @see #hash32(byte[], int, int)
      */
     public static int hash32(final byte[] data, final int length) {
         return hash32(data, length, 0x9747b28c);
     }

     /**
      * Generates a 32-bit hash from a string with a default seed.
      * <p>
      * Before 1.14 the string was converted using default encoding.
      * Since 1.14 the string is converted to bytes using UTF-8 encoding.
      * </p>
      * This is a helper method that will produce the same result as:
      *
      * <pre>
      * int seed = 0x9747b28c;
      * byte[] bytes = data.getBytes(StandardCharsets.UTF_8);
      * int hash = MurmurHash2.hash32(bytes, bytes.length, seed);
      * </pre>
      *
      * @param text The input string
      * @return The 32-bit hash
      * @see #hash32(byte[], int, int)
      */
     public static int hash32(final String text) {
         final byte[] bytes = StringUtils.getBytesUtf8(text);
         return hash32(bytes, bytes.length);
     }

     /**
      * Generates a 32-bit hash from a substring with a default seed value.
      * The string is converted to bytes using the default encoding.
      * This is a helper method that will produce the same result as:
      *
      * <pre>
      * int seed = 0x9747b28c;
      * byte[] bytes = text.substring(from, from + length).getBytes(StandardCharsets.UTF_8);
      * int hash = MurmurHash2.hash32(bytes, bytes.length, seed);
      * </pre>
      *
      * @param text The input string
      * @param from The starting index
      * @param length The length of the substring
      * @return The 32-bit hash
      * @see #hash32(byte[], int, int)
      */
     public static int hash32(final String text, final int from, final int length) {
         return hash32(text.substring(from, from + length));
     }

     /**
      * Generates a 64-bit hash from byte array of the given length and seed.
      *
      * @param data The input byte array
      * @param length The length of the array
      * @param seed The initial seed value
      * @return The 64-bit hash of the given array
      */
     public static long hash64(final byte[] data, final int length, final int seed) {
         long h = (seed & 0xffffffffL) ^ (length * M64);

         final int nblocks = length >> 3;

         // body
         for (int i = 0; i < nblocks; i++) {
             final int index = (i << 3);
             long k = getLittleEndianLong(data, index);

             k *= M64;
             k ^= k >>> R64;
             k *= M64;

             h ^= k;
             h *= M64;
         }

         final int index = (nblocks << 3);
         switch (length - index) {
         case 7:
             h ^= ((long) data[index + 6] & 0xff) << 48;
         case 6:
             h ^= ((long) data[index + 5] & 0xff) << 40;
         case 5:
             h ^= ((long) data[index + 4] & 0xff) << 32;
         case 4:
             h ^= ((long) data[index + 3] & 0xff) << 24;
         case 3:
             h ^= ((long) data[index + 2] & 0xff) << 16;
         case 2:
             h ^= ((long) data[index + 1] & 0xff) << 8;
         case 1:
             h ^= ((long) data[index] & 0xff);
             h *= M64;
         }

         h ^= h >>> R64;
         h *= M64;
         h ^= h >>> R64;

         return h;
     }

     /**
      * Generates a 64-bit hash from byte array with given length and a default seed value.
      * This is a helper method that will produce the same result as:
      *
      * <pre>
      * int seed = 0xe17a1465;
      * int hash = MurmurHash2.hash64(data, length, seed);
      * </pre>
      *
      * @param data The input byte array
      * @param length The length of the array
      * @return The 64-bit hash
      * @see #hash64(byte[], int, int)
      */
     public static long hash64(final byte[] data, final int length) {
         return hash64(data, length, 0xe17a1465);
     }

     /**
      * Generates a 64-bit hash from a string with a default seed.
      * <p>
      * Before 1.14 the string was converted using default encoding.
      * Since 1.14 the string is converted to bytes using UTF-8 encoding.
      * </p>
      * This is a helper method that will produce the same result as:
      *
      * <pre>
      * int seed = 0xe17a1465;
      * byte[] bytes = data.getBytes(StandardCharsets.UTF_8);
      * int hash = MurmurHash2.hash64(bytes, bytes.length, seed);
      * </pre>
      *
      * @param text The input string
      * @return The 64-bit hash
      * @see #hash64(byte[], int, int)
      */
     public static long hash64(final String text) {
         final byte[] bytes = StringUtils.getBytesUtf8(text);
         return hash64(bytes, bytes.length);
     }

     /**
      * Generates a 64-bit hash from a substring with a default seed value.
      * The string is converted to bytes using the default encoding.
      * This is a helper method that will produce the same result as:
      *
      * <pre>
      * int seed = 0xe17a1465;
      * byte[] bytes = text.substring(from, from + length).getBytes(StandardCharsets.UTF_8);
      * int hash = MurmurHash2.hash64(bytes, bytes.length, seed);
      * </pre>
      *
      * @param text The The input string
      * @param from The starting index
      * @param length The length of the substring
      * @return The 64-bit hash
      * @see #hash64(byte[], int, int)
      */
     public static long hash64(final String text, final int from, final int length) {
         return hash64(text.substring(from, from + length));
     }

     /**
      * Gets the little-endian int from 4 bytes starting at the specified index.
      *
      * @param data The data
      * @param index The index
      * @return The little-endian int
      */
     private static int getLittleEndianInt(final byte[] data, final int index) {
         return ((data[index    ] & 0xff)      ) |
                ((data[index + 1] & 0xff) <<  8) |
                ((data[index + 2] & 0xff) << 16) |
                ((data[index + 3] & 0xff) << 24);
     }

     /**
      * Gets the little-endian long from 8 bytes starting at the specified index.
      *
      * @param data The data
      * @param index The index
      * @return The little-endian long
      */
     private static long getLittleEndianLong(final byte[] data, final int index) {
         return (((long) data[index    ] & 0xff)      ) |
                (((long) data[index + 1] & 0xff) <<  8) |
                (((long) data[index + 2] & 0xff) << 16) |
                (((long) data[index + 3] & 0xff) << 24) |
                (((long) data[index + 4] & 0xff) << 32) |
                (((long) data[index + 5] & 0xff) << 40) |
                (((long) data[index + 6] & 0xff) << 48) |
                (((long) data[index + 7] & 0xff) << 56);
     }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	package org.apache.commons.codec.digest;

	import org.apache.commons.codec.binary.StringUtils;

	/**
	* Implementation of the MurmurHash2 32-bit and 64-bit hash functions.
	*
	* <p>MurmurHash is a non-cryptographic hash function suitable for general
	* hash-based lookup. The name comes from two basic operations, multiply (MU)
	* and rotate (R), used in its inner loop. Unlike cryptographic hash functions,
	* it is not specifically designed to be difficult to reverse by an adversary,
	* making it unsuitable for cryptographic purposes.</p>
	*
	* <p>This contains a Java port of the 32-bit hash function {@code MurmurHash2}
	* and the 64-bit hash function {@code MurmurHash64A} from Austin Applyby's
	* original {@code c++} code in SMHasher.</p>
	*
	* <p>This is a re-implementation of the original C code plus some additional
	* features.</p>
	*
	* <p>This is public domain code with no copyrights. From home page of
	* <a href="https://github.com/aappleby/smhasher">SMHasher</a>:</p>
	*
	* <blockquote>
	* "All MurmurHash versions are public domain software, and the author
	* disclaims all copyright to their code."
	* </blockquote>
	*
	* @see <a href="https://en.wikipedia.org/wiki/MurmurHash">MurmurHash</a>
	* @see <a href="https://github.com/aappleby/smhasher/blob/master/src/MurmurHash2.cpp">
	* Original MurmurHash2 c++ code</a>
	* @since 1.13
	*/
	public final class MurmurHash2 {

	// Constants for 32-bit variant
	private static final int M32 = 0x5bd1e995;
	private static final int R32 = 24;

	// Constants for 64-bit variant
	private static final long M64 = 0xc6a4a7935bd1e995L;
	private static final int R64 = 47;

	/** No instance methods. */
	private MurmurHash2() {
	}

	/**
	* Generates a 32-bit hash from byte array with the given length and seed.
	*
	* @param data The input byte array
	* @param length The length of the array
	* @param seed The initial seed value
	* @return The 32-bit hash
	*/
	public static int hash32(final byte[] data, final int length, final int seed) {
	// Initialize the hash to a random value
	int h = seed ^ length;

	// Mix 4 bytes at a time into the hash
	final int nblocks = length >> 2;

	// body
	for (int i = 0; i < nblocks; i++) {
	final int index = (i << 2);
	int k = getLittleEndianInt(data, index);
	k *= M32;
	k ^= k >>> R32;
	k *= M32;
	h *= M32;
	h ^= k;
	}

	// Handle the last few bytes of the input array
	final int index = (nblocks << 2);
	switch (length - index) {
	case 3:
	h ^= (data[index + 2] & 0xff) << 16;
	case 2:
	h ^= (data[index + 1] & 0xff) << 8;
	case 1:
	h ^= (data[index] & 0xff);
	h *= M32;
	}

	// Do a few final mixes of the hash to ensure the last few
	// bytes are well-incorporated.
	h ^= h >>> 13;
	h *= M32;
	h ^= h >>> 15;

	return h;
	}

	/**
	* Generates a 32-bit hash from byte array with the given length and a default seed value.
	* This is a helper method that will produce the same result as:
	*
	* <pre>
	* int seed = 0x9747b28c;
	* int hash = MurmurHash2.hash32(data, length, seed);
	* </pre>
	*
	* @param data The input byte array
	* @param length The length of the array
	* @return The 32-bit hash
	* @see #hash32(byte[], int, int)
	*/
	public static int hash32(final byte[] data, final int length) {
	return hash32(data, length, 0x9747b28c);
	}

	/**
	* Generates a 32-bit hash from a string with a default seed.
	* <p>
	* Before 1.14 the string was converted using default encoding.
	* Since 1.14 the string is converted to bytes using UTF-8 encoding.
	* </p>
	* This is a helper method that will produce the same result as:
	*
	* <pre>
	* int seed = 0x9747b28c;
	* byte[] bytes = data.getBytes(StandardCharsets.UTF_8);
	* int hash = MurmurHash2.hash32(bytes, bytes.length, seed);
	* </pre>
	*
	* @param text The input string
	* @return The 32-bit hash
	* @see #hash32(byte[], int, int)
	*/
	public static int hash32(final String text) {
	final byte[] bytes = StringUtils.getBytesUtf8(text);
	return hash32(bytes, bytes.length);
	}

	/**
	* Generates a 32-bit hash from a substring with a default seed value.
	* The string is converted to bytes using the default encoding.
	* This is a helper method that will produce the same result as:
	*
	* <pre>
	* int seed = 0x9747b28c;
	* byte[] bytes = text.substring(from, from + length).getBytes(StandardCharsets.UTF_8);
	* int hash = MurmurHash2.hash32(bytes, bytes.length, seed);
	* </pre>
	*
	* @param text The input string
	* @param from The starting index
	* @param length The length of the substring
	* @return The 32-bit hash
	* @see #hash32(byte[], int, int)
	*/
	public static int hash32(final String text, final int from, final int length) {
	return hash32(text.substring(from, from + length));
	}

	/**
	* Generates a 64-bit hash from byte array of the given length and seed.
	*
	* @param data The input byte array
	* @param length The length of the array
	* @param seed The initial seed value
	* @return The 64-bit hash of the given array
	*/
	public static long hash64(final byte[] data, final int length, final int seed) {
	long h = (seed & 0xffffffffL) ^ (length * M64);

	final int nblocks = length >> 3;

	// body
	for (int i = 0; i < nblocks; i++) {
	final int index = (i << 3);
	long k = getLittleEndianLong(data, index);

	k *= M64;
	k ^= k >>> R64;
	k *= M64;

	h ^= k;
	h *= M64;
	}

	final int index = (nblocks << 3);
	switch (length - index) {
	case 7:
	h ^= ((long) data[index + 6] & 0xff) << 48;
	case 6:
	h ^= ((long) data[index + 5] & 0xff) << 40;
	case 5:
	h ^= ((long) data[index + 4] & 0xff) << 32;
	case 4:
	h ^= ((long) data[index + 3] & 0xff) << 24;
	case 3:
	h ^= ((long) data[index + 2] & 0xff) << 16;
	case 2:
	h ^= ((long) data[index + 1] & 0xff) << 8;
	case 1:
	h ^= ((long) data[index] & 0xff);
	h *= M64;
	}

	h ^= h >>> R64;
	h *= M64;
	h ^= h >>> R64;

	return h;
	}

	/**
	* Generates a 64-bit hash from byte array with given length and a default seed value.
	* This is a helper method that will produce the same result as:
	*
	* <pre>
	* int seed = 0xe17a1465;
	* int hash = MurmurHash2.hash64(data, length, seed);
	* </pre>
	*
	* @param data The input byte array
	* @param length The length of the array
	* @return The 64-bit hash
	* @see #hash64(byte[], int, int)
	*/
	public static long hash64(final byte[] data, final int length) {
	return hash64(data, length, 0xe17a1465);
	}

	/**
	* Generates a 64-bit hash from a string with a default seed.
	* <p>
	* Before 1.14 the string was converted using default encoding.
	* Since 1.14 the string is converted to bytes using UTF-8 encoding.
	* </p>
	* This is a helper method that will produce the same result as:
	*
	* <pre>
	* int seed = 0xe17a1465;
	* byte[] bytes = data.getBytes(StandardCharsets.UTF_8);
	* int hash = MurmurHash2.hash64(bytes, bytes.length, seed);
	* </pre>
	*
	* @param text The input string
	* @return The 64-bit hash
	* @see #hash64(byte[], int, int)
	*/
	public static long hash64(final String text) {
	final byte[] bytes = StringUtils.getBytesUtf8(text);
	return hash64(bytes, bytes.length);
	}

	/**
	* Generates a 64-bit hash from a substring with a default seed value.
	* The string is converted to bytes using the default encoding.
	* This is a helper method that will produce the same result as:
	*
	* <pre>
	* int seed = 0xe17a1465;
	* byte[] bytes = text.substring(from, from + length).getBytes(StandardCharsets.UTF_8);
	* int hash = MurmurHash2.hash64(bytes, bytes.length, seed);
	* </pre>
	*
	* @param text The The input string
	* @param from The starting index
	* @param length The length of the substring
	* @return The 64-bit hash
	* @see #hash64(byte[], int, int)
	*/
	public static long hash64(final String text, final int from, final int length) {
	return hash64(text.substring(from, from + length));
	}

	/**
	* Gets the little-endian int from 4 bytes starting at the specified index.
	*
	* @param data The data
	* @param index The index
	* @return The little-endian int
	*/
	private static int getLittleEndianInt(final byte[] data, final int index) {
	return ((data[index ] & 0xff) ) \|
	((data[index + 1] & 0xff) << 8) \|
	((data[index + 2] & 0xff) << 16) \|
	((data[index + 3] & 0xff) << 24);
	}

	/**
	* Gets the little-endian long from 8 bytes starting at the specified index.
	*
	* @param data The data
	* @param index The index
	* @return The little-endian long
	*/
	private static long getLittleEndianLong(final byte[] data, final int index) {
	return (((long) data[index ] & 0xff) ) \|
	(((long) data[index + 1] & 0xff) << 8) \|
	(((long) data[index + 2] & 0xff) << 16) \|
	(((long) data[index + 3] & 0xff) << 24) \|
	(((long) data[index + 4] & 0xff) << 32) \|
	(((long) data[index + 5] & 0xff) << 40) \|
	(((long) data[index + 6] & 0xff) << 48) \|
	(((long) data[index + 7] & 0xff) << 56);
	}
	}