Merge branch 'Gian-MurmurHash3' into Memory2
diff --git a/src/main/java/org/apache/datasketches/hash/MurmurHash3.java b/src/main/java/org/apache/datasketches/hash/MurmurHash3.java
index 3823656..c4deef6 100644
--- a/src/main/java/org/apache/datasketches/hash/MurmurHash3.java
+++ b/src/main/java/org/apache/datasketches/hash/MurmurHash3.java
@@ -20,6 +20,8 @@
package org.apache.datasketches.hash;
import java.io.Serializable;
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
/**
@@ -177,6 +179,57 @@
return hashState.finalMix128(k1, k2, chars << 1); //convert to bytes
}
+ //--Hash of ByteBuffer------------------------------------------------
+ /**
+ * Returns a long array of size 2, which is a 128-bit hash of the input.
+ *
+ * @param buf The input byte buffer. Must be non-null and non-empty.
+ * @param seed A long valued seed.
+ * @return the hash.
+ */
+ public static long[] hash(final ByteBuffer buf, final long seed) {
+ final HashState hashState = new HashState(seed, seed);
+ final ByteBuffer littleEndianBuf;
+
+ if (buf.order() == ByteOrder.LITTLE_ENDIAN) {
+ littleEndianBuf = buf;
+ } else {
+ littleEndianBuf = buf.duplicate().order(ByteOrder.LITTLE_ENDIAN);
+ }
+
+ final int bytes = littleEndianBuf.remaining(); //in bytes
+ final int offset = littleEndianBuf.position();
+
+ // Number of full 128-bit blocks of 16 bytes.
+ // Possible exclusion of a remainder of up to 15 bytes.
+ final int nblocks = bytes >>> 4; //bytes / 16
+
+ // Process the 128-bit blocks (the body) into the hash
+ for (int i = 0; i < nblocks; i++ ) { //16 bytes per block
+ final long k1 = getLong(littleEndianBuf, offset + (i << 4), 8); //0, 16, 32, ...
+ final long k2 = getLong(littleEndianBuf, offset + (i << 4) + 8, 8); //8, 24, 40, ...
+ hashState.blockMix128(k1, k2);
+ }
+
+ // Get the tail index, remainder length
+ final int tail = nblocks << 4; //16 bytes per block
+ final int rem = bytes - tail; // remainder bytes: 0,1,...,15
+
+ // Get the tail
+ final long k1;
+ final long k2;
+ if (rem > 8) { //k1 -> whole; k2 -> partial
+ k1 = getLong(littleEndianBuf, offset + tail, 8);
+ k2 = getLong(littleEndianBuf, offset + tail + 8, rem - 8);
+ }
+ else { //k1 -> whole, partial or 0; k2 == 0
+ k1 = (rem == 0) ? 0 : getLong(littleEndianBuf, offset + tail, rem);
+ k2 = 0;
+ }
+ // Mix the tail into the hash and return
+ return hashState.finalMix128(k1, k2, bytes);
+ }
+
//--Hash of byte[]----------------------------------------------------
/**
* Returns a long array of size 2, which is a 128-bit hash of the input.
@@ -310,6 +363,43 @@
//--Helper methods----------------------------------------------------
/**
+ * Gets a long from the given byte buffer starting at the given position index and continuing for
+ * remainder (rem) bytes. The buffer must be in little-endian order. The bytes are extracted in
+ * little-endian order. The buffer endianness and limit are not checked.
+ *
+ * @param buf The given input byte buffer.
+ * @param index Zero-based index from the start of the byte array.
+ * @param rem Remainder bytes. An integer in the range [1,8].
+ * @return long
+ */
+ private static long getLong(final ByteBuffer buf, final int index, final int rem) {
+ long out = 0L;
+
+ switch (rem) {
+ case 8:
+ out = buf.getLong(index);
+ break;
+ case 7:
+ out ^= (buf.get(index + 6) & 0xFFL) << 48;
+ case 6:
+ out ^= (buf.get(index + 5) & 0xFFL) << 40;
+ case 5:
+ out ^= (buf.get(index + 4) & 0xFFL) << 32;
+ case 4:
+ out ^= (buf.get(index + 3) & 0xFFL) << 24;
+ case 3:
+ out ^= (buf.get(index + 2) & 0xFFL) << 16;
+ case 2:
+ out ^= (buf.get(index + 1) & 0xFFL) << 8;
+ case 1:
+ out ^= buf.get(index) & 0xFFL;
+ }
+
+ return out;
+ }
+
+ //--Helper methods----------------------------------------------------
+ /**
* Gets a long from the given byte array starting at the given byte array index and continuing for
* remainder (rem) bytes. The bytes are extracted in little-endian order. There is no limit
* checking.
diff --git a/src/main/java/org/apache/datasketches/hll/BaseHllSketch.java b/src/main/java/org/apache/datasketches/hll/BaseHllSketch.java
index bfb6797..076cd0b 100644
--- a/src/main/java/org/apache/datasketches/hll/BaseHllSketch.java
+++ b/src/main/java/org/apache/datasketches/hll/BaseHllSketch.java
@@ -27,6 +27,8 @@
import org.apache.datasketches.memory.Memory;
+import java.nio.ByteBuffer;
+
/**
* Although this class is package-private, it provides a single place to define and document
* the common public API for both HllSketch and Union.
@@ -328,6 +330,23 @@
}
/**
+ * Present the given byte buffer as a potential unique item.
+ * Bytes are read from the current position of the buffer until its limit.
+ * If the byte buffer is null or has no bytes remaining, no update attempt is made and the method returns.
+ *
+ * This method will not modify the position, mark, limit, or byte order of the buffer.
+ *
+ * Little-endian order is preferred, but not required. This method may perform better if the provided byte
+ * buffer is in little-endian order.
+ *
+ * @param data The given byte buffer.
+ */
+ public void update(final ByteBuffer data) {
+ if ((data == null) || (data.remaining() == 0)) { return; }
+ couponUpdate(coupon(hash(data, DEFAULT_UPDATE_SEED)));
+ }
+
+ /**
* Present the given byte array as a potential unique item.
* If the byte array is null or empty no update attempt is made and the method returns.
*
diff --git a/src/test/java/org/apache/datasketches/hash/MurmurHash3Test.java b/src/test/java/org/apache/datasketches/hash/MurmurHash3Test.java
index 3bfbcd2..7fea076 100644
--- a/src/test/java/org/apache/datasketches/hash/MurmurHash3Test.java
+++ b/src/test/java/org/apache/datasketches/hash/MurmurHash3Test.java
@@ -19,12 +19,18 @@
package org.apache.datasketches.hash;
+import static java.nio.charset.StandardCharsets.UTF_16;
import static org.apache.datasketches.hash.MurmurHash3.hash;
import static java.nio.charset.StandardCharsets.UTF_8;
import org.testng.Assert;
import org.testng.annotations.Test;
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
+import java.util.ArrayList;
+import java.util.List;
+
/**
* Tests the MurmurHash3 against specific, known hash results given known
* inputs obtained from the public domain C++ version 150.
@@ -47,6 +53,18 @@
}
@Test
+ public void checkByteBufRemainderGT8() { //byte buffer, remainder > 8
+ String keyStr = "The quick brown fox jumps over the lazy dog";
+ byte[] key = keyStr.getBytes(UTF_8);
+
+ //Should be:
+ long h1 = 0xe34bbc7bbc071b6cL;
+ long h2 = 0x7a433ca9c49a9347L;
+
+ checkHashByteBuf(key, h1, h2);
+ }
+
+ @Test
public void checkByteArrChange1bit() { //byte[], change one bit
String keyStr = "The quick brown fox jumps over the lazy eog";
byte[] key = keyStr.getBytes(UTF_8);
@@ -59,6 +77,18 @@
}
@Test
+ public void checkByteBufChange1bit() { //byte buffer, change one bit
+ String keyStr = "The quick brown fox jumps over the lazy eog";
+ byte[] key = keyStr.getBytes(UTF_8);
+
+ //Should be:
+ long h1 = 0x362108102c62d1c9L;
+ long h2 = 0x3285cd100292b305L;
+
+ checkHashByteBuf(key, h1, h2);
+ }
+
+ @Test
public void checkByteArrRemainderLt8() { //byte[], test a remainder < 8
String keyStr = "The quick brown fox jumps over the lazy dogdogdog";
byte[] key = keyStr.getBytes(UTF_8);
@@ -71,6 +101,18 @@
}
@Test
+ public void checkByteBufRemainderLt8() { //byte buffer, test a remainder < 8
+ String keyStr = "The quick brown fox jumps over the lazy dogdogdog";
+ byte[] key = keyStr.getBytes(UTF_8);
+
+ //Should be;
+ long h1 = 0x9c8205300e612fc4L;
+ long h2 = 0xcbc0af6136aa3df9L;
+
+ checkHashByteBuf(key, h1, h2);
+ }
+
+ @Test
public void checkByteArrReaminderEQ8() { //byte[], test a remainder = 8
String keyStr = "The quick brown fox jumps over the lazy1";
byte[] key = keyStr.getBytes(UTF_8);
@@ -80,7 +122,18 @@
long h2 = 0xbdbf05f8da0f0392L;
Assert.assertEquals(result[0], h1);
Assert.assertEquals(result[1], h2);
+ }
+ @Test
+ public void checkByteBufReaminderEQ8() { //byte buffer, test a remainder = 8
+ String keyStr = "The quick brown fox jumps over the lazy1";
+ byte[] key = keyStr.getBytes(UTF_8);
+
+ //Should be:
+ long h1 = 0xe3301a827e5cdfe3L;
+ long h2 = 0xbdbf05f8da0f0392L;
+
+ checkHashByteBuf(key, h1, h2);
}
/**
@@ -190,6 +243,21 @@
Assert.assertEquals(result[1], h2);
}
+ @Test
+ public void checkByteBufAllOnesZeros() { //byte[], test a ones byte and a zeros byte
+ byte[] key = {
+ 0x54, 0x68, 0x65, 0x20, 0x71, 0x75, 0x69, 0x63, 0x6b, 0x20, 0x62, 0x72, 0x6f, 0x77, 0x6e,
+ 0x20, 0x66, 0x6f, 0x78, 0x20, 0x6a, 0x75, 0x6d, 0x70, 0x73, 0x20, 0x6f, 0x76, 0x65,
+ 0x72, 0x20, 0x74, 0x68, 0x65, 0x20, 0x6c, 0x61, 0x7a, 0x79, 0x20, 0x64, 0x6f, 0x67,
+ (byte) 0xff, 0x64, 0x6f, 0x67, 0x00
+ };
+
+ long h1 = 0xe88abda785929c9eL;
+ long h2 = 0x96b98587cacc83d6L;
+
+ checkHashByteBuf(key, h1, h2);
+ }
+
/**
* This test demonstrates that the hash of byte[], char[], int[], or long[] will produce the
* same hash result if, and only if, all the arrays have the same exact length in bytes, and if
@@ -204,6 +272,13 @@
println(org.apache.datasketches.Util.longToHexBytes(out1[0]));
println(org.apache.datasketches.Util.longToHexBytes(out1[1]));
+ println("ByteBuffer");
+ ByteBuffer bBuf = ByteBuffer.wrap(bArr);
+ out = hash(bBuf, 0L);
+ Assert.assertEquals(out, out1);
+ println(org.apache.datasketches.Util.longToHexBytes(out1[0]));
+ println(org.apache.datasketches.Util.longToHexBytes(out1[1]));
+
println("Chars");
char[] cArr = {0X0201, 0X0403, 0X0605, 0X0807, 0X0a09, 0X0c0b, 0X0e0d, 0X100f,
0X1211, 0X1413, 0X1615, 0X1817};
@@ -267,6 +342,42 @@
return out;
}
+ /**
+ * Tests {@link MurmurHash3#hash(ByteBuffer, long)} on the provided key.
+ *
+ * @param key byte array to hash
+ * @param h1 first half of expected hash
+ * @param h2 second half of expected hash
+ */
+ private static void checkHashByteBuf(byte[] key, long h1, long h2) {
+ // Include dummy byte at start, end to make sure position, limit are respected.
+ ByteBuffer bigEndianBuf = ByteBuffer.allocate(key.length + 2).order(ByteOrder.BIG_ENDIAN);
+ bigEndianBuf.position(1);
+ bigEndianBuf.put(key);
+ bigEndianBuf.limit(1 + key.length);
+ bigEndianBuf.position(1);
+
+ // Test with little endian too.
+ ByteBuffer littleEndianBuf = bigEndianBuf.duplicate().order(ByteOrder.LITTLE_ENDIAN);
+
+ long[] result1 = MurmurHash3.hash(bigEndianBuf, 0);
+ long[] result2 = MurmurHash3.hash(littleEndianBuf, 0);
+
+ // Position, limit, order should not be changed.
+ Assert.assertEquals(1, bigEndianBuf.position());
+ Assert.assertEquals(1, littleEndianBuf.position());
+ Assert.assertEquals(1 + key.length, bigEndianBuf.limit());
+ Assert.assertEquals(1 + key.length, littleEndianBuf.limit());
+ Assert.assertEquals(ByteOrder.BIG_ENDIAN, bigEndianBuf.order());
+ Assert.assertEquals(ByteOrder.LITTLE_ENDIAN, littleEndianBuf.order());
+
+ // Check the actual hashes.
+ Assert.assertEquals(result1[0], h1);
+ Assert.assertEquals(result1[1], h2);
+ Assert.assertEquals(result2[0], h1);
+ Assert.assertEquals(result2[1], h2);
+ }
+
@Test
public void printlnTest() {
println("PRINTING: "+this.getClass().getName());
diff --git a/src/test/java/org/apache/datasketches/hll/BaseHllSketchTest.java b/src/test/java/org/apache/datasketches/hll/BaseHllSketchTest.java
index c3ee04b..9bbbeac 100644
--- a/src/test/java/org/apache/datasketches/hll/BaseHllSketchTest.java
+++ b/src/test/java/org/apache/datasketches/hll/BaseHllSketchTest.java
@@ -26,6 +26,8 @@
import org.apache.datasketches.memory.WritableMemory;
+import java.nio.ByteBuffer;
+
/**
* @author Lee Rhodes
*
@@ -40,6 +42,10 @@
sk.update(byteArr);
sk.update(new byte[] {});
sk.update(new byte[] {0, 1, 2, 3});
+ ByteBuffer byteBuf = null;
+ sk.update(byteBuf);
+ sk.update(ByteBuffer.wrap(new byte[] {}));
+ sk.update(ByteBuffer.wrap(new byte[] {0, 1, 2, 3}));
char[] charArr = null;
sk.update(charArr);
sk.update(new char[] {});
@@ -66,6 +72,10 @@
u.update(byteArr1);
u.update(new byte[] {});
u.update(new byte[] {0, 1, 2, 3});
+ ByteBuffer byteBuf1 = null;
+ u.update(byteBuf1);
+ u.update(ByteBuffer.wrap(new byte[] {}));
+ u.update(ByteBuffer.wrap(new byte[] {0, 1, 2, 3}));
char[] charArr1 = null;
u.update(charArr1);
u.update(new char[] {});