Merge pull request #353 from gianm/bytebuffer-hashing

Add ByteBuffer hashing methods to MurmurHash3, BaseHllSketch.
diff --git a/src/main/java/org/apache/datasketches/hash/MurmurHash3.java b/src/main/java/org/apache/datasketches/hash/MurmurHash3.java
index 3823656..c4deef6 100644
--- a/src/main/java/org/apache/datasketches/hash/MurmurHash3.java
+++ b/src/main/java/org/apache/datasketches/hash/MurmurHash3.java
@@ -20,6 +20,8 @@
 package org.apache.datasketches.hash;
 
 import java.io.Serializable;
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
 
 
 /**
@@ -177,6 +179,57 @@
     return hashState.finalMix128(k1, k2, chars << 1); //convert to bytes
   }
 
+  //--Hash of ByteBuffer------------------------------------------------
+  /**
+   * Returns a long array of size 2, which is a 128-bit hash of the input.
+   *
+   * @param buf The input byte buffer. Must be non-null and non-empty.
+   * @param seed A long valued seed.
+   * @return the hash.
+   */
+  public static long[] hash(final ByteBuffer buf, final long seed) {
+    final HashState hashState = new HashState(seed, seed);
+    final ByteBuffer littleEndianBuf;
+
+    if (buf.order() == ByteOrder.LITTLE_ENDIAN) {
+      littleEndianBuf = buf;
+    } else {
+      littleEndianBuf = buf.duplicate().order(ByteOrder.LITTLE_ENDIAN);
+    }
+
+    final int bytes = littleEndianBuf.remaining(); //in bytes
+    final int offset = littleEndianBuf.position();
+
+    // Number of full 128-bit blocks of 16 bytes.
+    // Possible exclusion of a remainder of up to 15 bytes.
+    final int nblocks = bytes >>> 4; //bytes / 16
+
+    // Process the 128-bit blocks (the body) into the hash
+    for (int i = 0; i < nblocks; i++ ) { //16 bytes per block
+      final long k1 = getLong(littleEndianBuf, offset + (i << 4), 8); //0, 16, 32, ...
+      final long k2 = getLong(littleEndianBuf, offset + (i << 4) + 8, 8); //8, 24, 40, ...
+      hashState.blockMix128(k1, k2);
+    }
+
+    // Get the tail index, remainder length
+    final int tail = nblocks << 4; //16 bytes per block
+    final int rem = bytes - tail; // remainder bytes: 0,1,...,15
+
+    // Get the tail
+    final long k1;
+    final long k2;
+    if (rem > 8) { //k1 -> whole; k2 -> partial
+      k1 = getLong(littleEndianBuf, offset + tail, 8);
+      k2 = getLong(littleEndianBuf, offset + tail + 8, rem - 8);
+    }
+    else { //k1 -> whole, partial or 0; k2 == 0
+      k1 = (rem == 0) ? 0 : getLong(littleEndianBuf, offset + tail, rem);
+      k2 = 0;
+    }
+    // Mix the tail into the hash and return
+    return hashState.finalMix128(k1, k2, bytes);
+  }
+
   //--Hash of byte[]----------------------------------------------------
   /**
    * Returns a long array of size 2, which is a 128-bit hash of the input.
@@ -310,6 +363,43 @@
 
   //--Helper methods----------------------------------------------------
   /**
+   * Gets a long from the given byte buffer starting at the given position index and continuing for
+   * remainder (rem) bytes. The buffer must be in little-endian order. The bytes are extracted in
+   * little-endian order. The buffer endianness and limit are not checked.
+   *
+   * @param buf The given input byte buffer.
+   * @param index Zero-based index from the start of the byte array.
+   * @param rem Remainder bytes. An integer in the range [1,8].
+   * @return long
+   */
+  private static long getLong(final ByteBuffer buf, final int index, final int rem) {
+    long out = 0L;
+
+    switch (rem) {
+      case 8:
+        out = buf.getLong(index);
+        break;
+      case 7:
+        out ^= (buf.get(index + 6) & 0xFFL) << 48;
+      case 6:
+        out ^= (buf.get(index + 5) & 0xFFL) << 40;
+      case 5:
+        out ^= (buf.get(index + 4) & 0xFFL) << 32;
+      case 4:
+        out ^= (buf.get(index + 3) & 0xFFL) << 24;
+      case 3:
+        out ^= (buf.get(index + 2) & 0xFFL) << 16;
+      case 2:
+        out ^= (buf.get(index + 1) & 0xFFL) << 8;
+      case 1:
+        out ^= buf.get(index) & 0xFFL;
+    }
+
+    return out;
+  }
+
+  //--Helper methods----------------------------------------------------
+  /**
    * Gets a long from the given byte array starting at the given byte array index and continuing for
    * remainder (rem) bytes. The bytes are extracted in little-endian order. There is no limit
    * checking.
diff --git a/src/main/java/org/apache/datasketches/hll/BaseHllSketch.java b/src/main/java/org/apache/datasketches/hll/BaseHllSketch.java
index bfb6797..076cd0b 100644
--- a/src/main/java/org/apache/datasketches/hll/BaseHllSketch.java
+++ b/src/main/java/org/apache/datasketches/hll/BaseHllSketch.java
@@ -27,6 +27,8 @@
 
 import org.apache.datasketches.memory.Memory;
 
+import java.nio.ByteBuffer;
+
 /**
  * Although this class is package-private, it provides a single place to define and document
  * the common public API for both HllSketch and Union.
@@ -328,6 +330,23 @@
   }
 
   /**
+   * Present the given byte buffer as a potential unique item.
+   * Bytes are read from the current position of the buffer until its limit.
+   * If the byte buffer is null or has no bytes remaining, no update attempt is made and the method returns.
+   *
+   * This method will not modify the position, mark, limit, or byte order of the buffer.
+   *
+   * Little-endian order is preferred, but not required. This method may perform better if the provided byte
+   * buffer is in little-endian order.
+   *
+   * @param data The given byte buffer.
+   */
+  public void update(final ByteBuffer data) {
+    if ((data == null) || (data.remaining() == 0)) { return; }
+    couponUpdate(coupon(hash(data, DEFAULT_UPDATE_SEED)));
+  }
+
+  /**
    * Present the given byte array as a potential unique item.
    * If the byte array is null or empty no update attempt is made and the method returns.
    *
diff --git a/src/test/java/org/apache/datasketches/hash/MurmurHash3Test.java b/src/test/java/org/apache/datasketches/hash/MurmurHash3Test.java
index 3bfbcd2..7fea076 100644
--- a/src/test/java/org/apache/datasketches/hash/MurmurHash3Test.java
+++ b/src/test/java/org/apache/datasketches/hash/MurmurHash3Test.java
@@ -19,12 +19,18 @@
 
 package org.apache.datasketches.hash;
 
+import static java.nio.charset.StandardCharsets.UTF_16;
 import static org.apache.datasketches.hash.MurmurHash3.hash;
 import static java.nio.charset.StandardCharsets.UTF_8;
 
 import org.testng.Assert;
 import org.testng.annotations.Test;
 
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
+import java.util.ArrayList;
+import java.util.List;
+
 /**
  * Tests the MurmurHash3 against specific, known hash results given known
  * inputs obtained from the public domain C++ version 150.
@@ -47,6 +53,18 @@
   }
 
   @Test
+  public void checkByteBufRemainderGT8() { //byte buffer, remainder > 8
+    String keyStr = "The quick brown fox jumps over the lazy dog";
+    byte[] key = keyStr.getBytes(UTF_8);
+
+    //Should be:
+    long h1 = 0xe34bbc7bbc071b6cL;
+    long h2 = 0x7a433ca9c49a9347L;
+
+    checkHashByteBuf(key, h1, h2);
+  }
+
+  @Test
   public void checkByteArrChange1bit() { //byte[], change one bit
     String keyStr = "The quick brown fox jumps over the lazy eog";
     byte[] key = keyStr.getBytes(UTF_8);
@@ -59,6 +77,18 @@
   }
 
   @Test
+  public void checkByteBufChange1bit() { //byte buffer, change one bit
+    String keyStr = "The quick brown fox jumps over the lazy eog";
+    byte[] key = keyStr.getBytes(UTF_8);
+
+    //Should be:
+    long h1 = 0x362108102c62d1c9L;
+    long h2 = 0x3285cd100292b305L;
+
+    checkHashByteBuf(key, h1, h2);
+  }
+
+  @Test
   public void checkByteArrRemainderLt8() { //byte[], test a remainder < 8
     String keyStr = "The quick brown fox jumps over the lazy dogdogdog";
     byte[] key = keyStr.getBytes(UTF_8);
@@ -71,6 +101,18 @@
   }
 
   @Test
+  public void checkByteBufRemainderLt8() { //byte buffer, test a remainder < 8
+    String keyStr = "The quick brown fox jumps over the lazy dogdogdog";
+    byte[] key = keyStr.getBytes(UTF_8);
+
+    //Should be;
+    long h1 = 0x9c8205300e612fc4L;
+    long h2 = 0xcbc0af6136aa3df9L;
+
+    checkHashByteBuf(key, h1, h2);
+  }
+
+  @Test
   public void checkByteArrReaminderEQ8() { //byte[], test a remainder = 8
     String keyStr = "The quick brown fox jumps over the lazy1";
     byte[] key = keyStr.getBytes(UTF_8);
@@ -80,7 +122,18 @@
     long h2 = 0xbdbf05f8da0f0392L;
     Assert.assertEquals(result[0], h1);
     Assert.assertEquals(result[1], h2);
+  }
 
+  @Test
+  public void checkByteBufReaminderEQ8() { //byte buffer, test a remainder = 8
+    String keyStr = "The quick brown fox jumps over the lazy1";
+    byte[] key = keyStr.getBytes(UTF_8);
+
+    //Should be:
+    long h1 = 0xe3301a827e5cdfe3L;
+    long h2 = 0xbdbf05f8da0f0392L;
+
+    checkHashByteBuf(key, h1, h2);
   }
 
   /**
@@ -190,6 +243,21 @@
     Assert.assertEquals(result[1], h2);
   }
 
+  @Test
+  public void checkByteBufAllOnesZeros() { //byte[], test a ones byte and a zeros byte
+    byte[] key = {
+      0x54, 0x68, 0x65, 0x20, 0x71, 0x75, 0x69, 0x63, 0x6b, 0x20, 0x62, 0x72, 0x6f, 0x77, 0x6e,
+      0x20, 0x66, 0x6f, 0x78, 0x20, 0x6a, 0x75, 0x6d, 0x70, 0x73, 0x20, 0x6f, 0x76, 0x65,
+      0x72, 0x20, 0x74, 0x68, 0x65, 0x20, 0x6c, 0x61, 0x7a, 0x79, 0x20, 0x64, 0x6f, 0x67,
+      (byte) 0xff, 0x64, 0x6f, 0x67, 0x00
+    };
+
+    long h1 = 0xe88abda785929c9eL;
+    long h2 = 0x96b98587cacc83d6L;
+
+    checkHashByteBuf(key, h1, h2);
+  }
+
   /**
    * This test demonstrates that the hash of byte[], char[], int[], or long[] will produce the
    * same hash result if, and only if, all the arrays have the same exact length in bytes, and if
@@ -204,6 +272,13 @@
     println(org.apache.datasketches.Util.longToHexBytes(out1[0]));
     println(org.apache.datasketches.Util.longToHexBytes(out1[1]));
 
+    println("ByteBuffer");
+    ByteBuffer bBuf = ByteBuffer.wrap(bArr);
+    out = hash(bBuf, 0L);
+    Assert.assertEquals(out, out1);
+    println(org.apache.datasketches.Util.longToHexBytes(out1[0]));
+    println(org.apache.datasketches.Util.longToHexBytes(out1[1]));
+
     println("Chars");
     char[] cArr = {0X0201, 0X0403, 0X0605, 0X0807,   0X0a09, 0X0c0b, 0X0e0d, 0X100f,
         0X1211, 0X1413, 0X1615, 0X1817};
@@ -267,6 +342,42 @@
     return out;
   }
 
+  /**
+   * Tests {@link MurmurHash3#hash(ByteBuffer, long)} on the provided key.
+   *
+   * @param key byte array to hash
+   * @param h1 first half of expected hash
+   * @param h2 second half of expected hash
+   */
+  private static void checkHashByteBuf(byte[] key, long h1, long h2) {
+    // Include dummy byte at start, end to make sure position, limit are respected.
+    ByteBuffer bigEndianBuf = ByteBuffer.allocate(key.length + 2).order(ByteOrder.BIG_ENDIAN);
+    bigEndianBuf.position(1);
+    bigEndianBuf.put(key);
+    bigEndianBuf.limit(1 + key.length);
+    bigEndianBuf.position(1);
+
+    // Test with little endian too.
+    ByteBuffer littleEndianBuf = bigEndianBuf.duplicate().order(ByteOrder.LITTLE_ENDIAN);
+
+    long[] result1 = MurmurHash3.hash(bigEndianBuf, 0);
+    long[] result2 = MurmurHash3.hash(littleEndianBuf, 0);
+
+    // Position, limit, order should not be changed.
+    Assert.assertEquals(1, bigEndianBuf.position());
+    Assert.assertEquals(1, littleEndianBuf.position());
+    Assert.assertEquals(1 + key.length, bigEndianBuf.limit());
+    Assert.assertEquals(1 + key.length, littleEndianBuf.limit());
+    Assert.assertEquals(ByteOrder.BIG_ENDIAN, bigEndianBuf.order());
+    Assert.assertEquals(ByteOrder.LITTLE_ENDIAN, littleEndianBuf.order());
+
+    // Check the actual hashes.
+    Assert.assertEquals(result1[0], h1);
+    Assert.assertEquals(result1[1], h2);
+    Assert.assertEquals(result2[0], h1);
+    Assert.assertEquals(result2[1], h2);
+  }
+
   @Test
   public void printlnTest() {
     println("PRINTING: "+this.getClass().getName());
diff --git a/src/test/java/org/apache/datasketches/hll/BaseHllSketchTest.java b/src/test/java/org/apache/datasketches/hll/BaseHllSketchTest.java
index e78e807..ea8ffef 100644
--- a/src/test/java/org/apache/datasketches/hll/BaseHllSketchTest.java
+++ b/src/test/java/org/apache/datasketches/hll/BaseHllSketchTest.java
@@ -26,6 +26,8 @@
 
 import org.apache.datasketches.memory.WritableMemory;
 
+import java.nio.ByteBuffer;
+
 /**
  * @author Lee Rhodes
  *
@@ -40,6 +42,10 @@
     sk.update(byteArr);
     sk.update(new byte[] {});
     sk.update(new byte[] {0, 1, 2, 3});
+    ByteBuffer byteBuf = null;
+    sk.update(byteBuf);
+    sk.update(ByteBuffer.wrap(new byte[] {}));
+    sk.update(ByteBuffer.wrap(new byte[] {0, 1, 2, 3}));
     char[] charArr = null;
     sk.update(charArr);
     sk.update(new char[] {});
@@ -66,6 +72,10 @@
     u.update(byteArr1);
     u.update(new byte[] {});
     u.update(new byte[] {0, 1, 2, 3});
+    ByteBuffer byteBuf1 = null;
+    u.update(byteBuf1);
+    u.update(ByteBuffer.wrap(new byte[] {}));
+    u.update(ByteBuffer.wrap(new byte[] {0, 1, 2, 3}));
     char[] charArr1 = null;
     u.update(charArr1);
     u.update(new char[] {});