new Jaccard Similarity UDF + test
diff --git a/src/main/java/com/yahoo/sketches/pig/theta/JaccardSimilarity.java b/src/main/java/com/yahoo/sketches/pig/theta/JaccardSimilarity.java
new file mode 100644
index 0000000..56b294b
--- /dev/null
+++ b/src/main/java/com/yahoo/sketches/pig/theta/JaccardSimilarity.java
@@ -0,0 +1,108 @@
+package com.yahoo.sketches.pig.theta;
+
+import com.yahoo.memory.Memory;
+import com.yahoo.sketches.theta.Sketch;
+import org.apache.pig.EvalFunc;
+import org.apache.pig.backend.executionengine.ExecException;
+import org.apache.pig.data.DataByteArray;
+import org.apache.pig.data.Tuple;
+import org.apache.pig.data.TupleFactory;
+
+import java.io.IOException;
+
+import static com.yahoo.sketches.Util.DEFAULT_UPDATE_SEED;
+import static com.yahoo.sketches.pig.theta.PigUtil.extractFieldAtIndex;
+
+/**
+ * This is a Pig UDF that performs the JaccardSimilarity Operation on two given
+ * Sketches.
+ *
+ * @author eshcar
+ */
+public class JaccardSimilarity extends EvalFunc<Tuple> {
+
+  // @formatter:off
+  /**
+   * Top Level Exec Function.
+   * <p>
+   * This method accepts a <b>Sketch JaccardSimilarityAB Input Tuple</b> and returns a
+   * <b>Tuple {LowerBound, Estimate, UpperBound} </b> of the Jaccard ratio.
+   * The Upper and Lower bounds are for a confidence interval of 95.4% or +/- 2 standard deviations.
+   * </p>
+   *
+   * <b>Sketch JaccardSimilarityAB Input Tuple</b>
+   * <ul>
+   *   <li>Tuple: TUPLE (Must contain 2 fields): <br>
+   *   Java data type: Pig DataType: Description
+   *     <ul>
+   *       <li>index 0: DataByteArray: BYTEARRAY: Sketch A</li>
+   *       <li>index 1: DataByteArray: BYTEARRAY: Sketch B</li>
+   *     </ul>
+   *   </li>
+   * </ul>
+   *
+   * <p>Any other input tuple will throw an exception!</p>
+   *
+   * <b>Tuple {LowerBound, Estimate, UpperBound}</b>
+   * <ul>
+   *   <li>Tuple: TUPLE (Contains 3 fields)
+   *     <ul>
+   *       <li>index 0: Double: DOUBLE = The lower bound of the Jaccard Similarity.</li>
+   *       <li>index 1: Double: DOUBLE = The estimation of the Jaccard Similarity.</li>
+   *       <li>index 2: Double: DOUBLE = The upper bound of the Jaccard Similarity.</li>
+   *     </ul>
+   *   </li>
+   * </ul>
+   *
+   * @param inputTuple Sketch JaccardSimilarityAB Input Tuple, as described above.
+   * @return Tuple {LowerBound, Estimate, UpperBound}, as described above.
+   * @throws ExecException from Pig.
+   */
+  // @formatter:on
+
+  @Override //TOP LEVEL EXEC
+  public Tuple exec(final Tuple inputTuple) throws IOException {
+    //The exec is a stateless function: it wraps both input fields and calls static helpers.
+    final Sketch sketchA = wrapSketch(extractFieldAtIndex(inputTuple, 0));
+    final Sketch sketchB = wrapSketch(extractFieldAtIndex(inputTuple, 1));
+    return doubleArrayToTuple(
+        com.yahoo.sketches.theta.JaccardSimilarity.jaccard(sketchA, sketchB));
+  }
+
+  /**
+   * Wraps the bytes of one input field as a Sketch using the default update seed.
+   *
+   * @param field the tuple field, a DataByteArray holding a serialized sketch, or null
+   * @return the wrapped Sketch, or null if the field is null
+   */
+  private static Sketch wrapSketch(final Object field) {
+    if (field == null) {
+      return null;
+    }
+    final Memory srcMem = Memory.wrap(((DataByteArray) field).get());
+    return Sketch.wrap(srcMem, DEFAULT_UPDATE_SEED);
+  }
+
+  /**
+   * Serialize a double array into a Tuple.
+   *
+   * @param doubleArray The doubles array to serialize
+   * @return Double Tuple, or null if the array is null or empty.
+   */
+  private static Tuple doubleArrayToTuple(final double[] doubleArray) {
+    if (doubleArray == null || doubleArray.length == 0) {
+      return null;
+    }
+    final int arraySize = doubleArray.length;
+    final Tuple outputTuple = TupleFactory.getInstance().newTuple(arraySize);
+    for (int i = 0; i < arraySize; i++) {
+      try {
+        outputTuple.set(i, doubleArray[i]);
+      } catch (final IOException e) {
+        // Chain the cause so the original stack trace is not lost.
+        throw new IllegalArgumentException("IOException thrown: " + e, e);
+      }
+    }
+    return outputTuple;
+  }
+}
diff --git a/src/test/java/com/yahoo/sketches/pig/theta/JaccardSimilarityTest.java b/src/test/java/com/yahoo/sketches/pig/theta/JaccardSimilarityTest.java
new file mode 100644
index 0000000..d6724bd
--- /dev/null
+++ b/src/test/java/com/yahoo/sketches/pig/theta/JaccardSimilarityTest.java
@@ -0,0 +1,66 @@
+package com.yahoo.sketches.pig.theta;
+
+import org.apache.pig.EvalFunc;
+import org.apache.pig.data.Tuple;
+import org.apache.pig.data.TupleFactory;
+import org.testng.annotations.Test;
+
+import java.io.IOException;
+
+import static com.yahoo.sketches.pig.PigTestingUtil.createDbaFromQssRange;
+import static org.testng.Assert.assertEquals;
+import static org.testng.Assert.assertNotNull;
+
+/**
+ * Tests for the JaccardSimilarity Pig UDF.
+ *
+ * @author eshcar
+ */
+public class JaccardSimilarityTest {
+
+  /**
+   * Checks the UDF result for each combination of null and valid input fields.
+   */
+  @Test
+  public void checkNullCombinations() throws IOException {
+    final EvalFunc<Tuple> jaccardFunc = new JaccardSimilarity();
+
+    //Two nulls
+    Tuple inputTuple = TupleFactory.getInstance().newTuple(2);
+    assertAllZeros(jaccardFunc.exec(inputTuple));
+
+    //A is null, B is valid
+    inputTuple = TupleFactory.getInstance().newTuple(2);
+    inputTuple.set(1, createDbaFromQssRange(256, 0, 128));
+    assertAllZeros(jaccardFunc.exec(inputTuple));
+
+    //A is valid, B is null
+    inputTuple = TupleFactory.getInstance().newTuple(2);
+    inputTuple.set(0, createDbaFromQssRange(256, 0, 256));
+    assertAllZeros(jaccardFunc.exec(inputTuple));
+
+    //Both valid
+    inputTuple = TupleFactory.getInstance().newTuple(2);
+    inputTuple.set(0, createDbaFromQssRange(256, 0, 256));
+    inputTuple.set(1, createDbaFromQssRange(256, 0, 128));
+    final Tuple resultTuple = jaccardFunc.exec(inputTuple);
+    assertNotNull(resultTuple);
+    assertEquals(resultTuple.size(), 3);
+    assertEquals(resultTuple.get(0), 0.5);
+    assertEquals(resultTuple.get(1), 0.5);
+    assertEquals(resultTuple.get(2), 0.5);
+  }
+
+  /**
+   * Asserts that the result is a non-null tuple of three fields, each equal to 0.0.
+   *
+   * @param resultTuple the tuple returned by the UDF
+   */
+  private static void assertAllZeros(final Tuple resultTuple) {
+    assertNotNull(resultTuple);
+    assertEquals(resultTuple.size(), 3);
+    for (final Object d : resultTuple.getAll()) {
+      assertEquals(d, 0.0);
+    }
+  }
+}