Merge pull request #55 from packet23/jaccard-udf

EstimateSketchSimilarityUDF
diff --git a/src/main/java/org/apache/datasketches/hive/theta/EstimateSketchSimilarityUDF.java b/src/main/java/org/apache/datasketches/hive/theta/EstimateSketchSimilarityUDF.java
new file mode 100644
index 0000000..39a5982
--- /dev/null
+++ b/src/main/java/org/apache/datasketches/hive/theta/EstimateSketchSimilarityUDF.java
@@ -0,0 +1,59 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.datasketches.hive.theta;
+
+import org.apache.datasketches.hive.common.BytesWritableHelper;
+import org.apache.datasketches.theta.JaccardSimilarity;
+import org.apache.datasketches.theta.Sketch;
+import org.apache.hadoop.hive.ql.exec.UDF;
+import org.apache.hadoop.io.BytesWritable;
+
+import static org.apache.datasketches.Util.DEFAULT_UPDATE_SEED;
+
+/**
+ * Hive UDF that estimates the Jaccard similarity of two serialized theta sketches.
+ * Non-empty inputs are wrapped as sketches using DEFAULT_UPDATE_SEED.
+ */
+public class EstimateSketchSimilarityUDF extends UDF {
+
+  /**
+   * Main logic called by Hive. Computes the Jaccard similarity of two sketches of the same or different columns.
+   * A null or zero-length input is not wrapped and reaches JaccardSimilarity.jaccard as a null sketch.
+   * @param firstSketchBytes
+   *          serialized first sketch to be compared; may be null or empty.
+   * @param secondSketchBytes
+   *          serialized second sketch to be compared; may be null or empty.
+   * @return element 1 of the array returned by JaccardSimilarity.jaccard, used here as the similarity estimate
+   */
+  public double evaluate(final BytesWritable firstSketchBytes, final BytesWritable secondSketchBytes) {
+    Sketch firstSketch = null; // stays null when the input is null or zero-length
+    if (firstSketchBytes != null && firstSketchBytes.getLength() > 0) {
+      firstSketch = Sketch.wrap(BytesWritableHelper.wrapAsMemory(firstSketchBytes), DEFAULT_UPDATE_SEED);
+    }
+
+    Sketch secondSketch = null; // same null/empty handling as the first sketch
+    if (secondSketchBytes != null && secondSketchBytes.getLength() > 0) {
+      secondSketch = Sketch.wrap(BytesWritableHelper.wrapAsMemory(secondSketchBytes), DEFAULT_UPDATE_SEED);
+    }
+
+    double[] jaccard = JaccardSimilarity.jaccard(firstSketch, secondSketch);
+    return jaccard[1]; // NOTE(review): presumably {lower bound, estimate, upper bound} - confirm in library docs
+  }
+}
diff --git a/src/test/java/org/apache/datasketches/hive/theta/EstimateSketchSimilarityUDFTest.java b/src/test/java/org/apache/datasketches/hive/theta/EstimateSketchSimilarityUDFTest.java
new file mode 100644
index 0000000..9efbcda
--- /dev/null
+++ b/src/test/java/org/apache/datasketches/hive/theta/EstimateSketchSimilarityUDFTest.java
@@ -0,0 +1,68 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.datasketches.hive.theta;
+
+import org.apache.datasketches.theta.Sketches;
+import org.apache.datasketches.theta.UpdateSketch;
+import org.apache.hadoop.io.BytesWritable;
+import org.testng.annotations.Test;
+
+import static org.testng.AssertJUnit.assertEquals;
+
+@SuppressWarnings("javadoc")
+public class EstimateSketchSimilarityUDFTest {
+
+  @Test
+  public void evaluateNull() { // both inputs null: the UDF is expected to report 0.0
+    EstimateSketchSimilarityUDF testObject = new EstimateSketchSimilarityUDF();
+    double testResult = testObject.evaluate(null, null);
+    assertEquals(0.0, testResult);
+  }
+
+  @Test
+  public void evaluateEmpty() { // both inputs default-constructed (presumably zero-length): expected 0.0
+    EstimateSketchSimilarityUDF testObject = new EstimateSketchSimilarityUDF();
+    double testResult = testObject.evaluate(new BytesWritable(), new BytesWritable());
+    assertEquals(0.0, testResult);
+  }
+
+  @Test
+  public void evaluateValidSketch() { // overlapping sketches: expected similarity is shared / union
+    EstimateSketchSimilarityUDF testObject = new EstimateSketchSimilarityUDF();
+
+    UpdateSketch sketch1 = Sketches.updateSketchBuilder().setNominalEntries(1024).build();
+    for (int i = 0; i<128; i++) { // values 0..127: 128 distinct updates
+      sketch1.update(i);
+    }
+
+    UpdateSketch sketch2 = Sketches.updateSketchBuilder().setNominalEntries(1024).build();
+    for (int i = 100; i<128; i++) { // values 100..127: 28 updates, all of them also fed to sketch1
+      sketch2.update(i);
+    }
+
+    BytesWritable input1 = new BytesWritable(sketch1.compact().toByteArray()); // serialized compact form
+    BytesWritable input2 = new BytesWritable(sketch2.compact().toByteArray());
+
+    double result = testObject.evaluate(input1, input2); // assumes the estimate is exact at this size - TODO confirm
+
+    assertEquals(28.0 / 128.0, result); // 28 shared values over 128 values in the union
+  }
+
+}