Merge pull request #354 from apache/move_ks_test

move KS test from Util to its own class
diff --git a/src/main/java/org/apache/datasketches/quantiles/KolmogorovSmirnov.java b/src/main/java/org/apache/datasketches/quantiles/KolmogorovSmirnov.java
new file mode 100644
index 0000000..848272f
--- /dev/null
+++ b/src/main/java/org/apache/datasketches/quantiles/KolmogorovSmirnov.java
@@ -0,0 +1,111 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.datasketches.quantiles;
+
+/**
+ * Kolmogorov-Smirnov Test
+ * See <a href="https://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test">Kolmogorov–Smirnov Test</a>
+ */
+final class KolmogorovSmirnov {
+
+  /**
+   * Computes the raw delta area between two quantile sketches for the
+   * {@link #kolmogorovSmirnovTest(DoublesSketch, DoublesSketch, double)
+   * Kolmogorov-Smirnov Test}
+   * method.
+   * @param sketch1 Input DoubleSketch 1
+   * @param sketch2 Input DoubleSketch 2
+   * @return the raw delta area between two quantile sketches
+   */
+  public static double computeKSDelta(final DoublesSketch sketch1, final DoublesSketch sketch2) {
+    final DoublesAuxiliary p = new DoublesAuxiliary(sketch1);
+    final DoublesAuxiliary q = new DoublesAuxiliary(sketch2);
+
+    final double[] pSamplesArr = p.auxSamplesArr_;
+    final double[] qSamplesArr = q.auxSamplesArr_;
+    final long[] pCumWtsArr = p.auxCumWtsArr_;
+    final long[] qCumWtsArr = q.auxCumWtsArr_;
+    final int pSamplesArrLen = pSamplesArr.length;
+    final int qSamplesArrLen = qSamplesArr.length;
+
+    final double n1 = sketch1.getN();
+    final double n2 = sketch2.getN();
+
+    double deltaArea = 0;
+    int i = 0;
+    int j = 0;
+
+    while ((i < pSamplesArrLen) && (j < qSamplesArrLen)) {
+      deltaArea = Math.max(deltaArea, Math.abs(pCumWtsArr[i] / n1 - qCumWtsArr[j] / n2));
+      if (pSamplesArr[i] < qSamplesArr[j]) {
+        i++;
+      } else if (qSamplesArr[j] < pSamplesArr[i]) {
+        j++;
+      } else {
+    	i++;
+        j++;
+      }
+    }
+
+    deltaArea = Math.max(deltaArea, Math.abs(pCumWtsArr[i] / n1 - qCumWtsArr[j] / n2));
+    return deltaArea;
+  }
+
+  /**
+   * Computes the adjusted delta area threshold for the
+   * {@link #kolmogorovSmirnovTest(DoublesSketch, DoublesSketch, double) Kolmogorov-Smirnov Test}
+   * method.
+   * This adjusts the computed threshold by the error epsilons of the two given sketches.
+   * @param sketch1 Input DoubleSketch 1
+   * @param sketch2 Input DoubleSketch 2
+   * @param tgtPvalue Target p-value. Typically .001 to .1, e.g., .05.
+   * @return the adjusted threshold to be compared with the raw delta area.
+   */
+  public static double computeKSThreshold(final DoublesSketch sketch1,
+                                          final DoublesSketch sketch2,
+                                          final double tgtPvalue) {
+    final double r1 = sketch1.getRetainedItems();
+    final double r2 = sketch2.getRetainedItems();
+    final double alpha = tgtPvalue;
+    final double alphaFactor = Math.sqrt(-0.5 * Math.log(0.5 * alpha));
+    final double deltaAreaThreshold = alphaFactor * Math.sqrt((r1 + r2) / (r1 * r2));
+    final double eps1 = sketch1.getNormalizedRankError(false);
+    final double eps2 = sketch2.getNormalizedRankError(false);
+    return deltaAreaThreshold + eps1 + eps2;
+  }
+
+  /**
+   * Performs the Kolmogorov-Smirnov Test between two quantiles sketches.
+   * Note: if the given sketches have insufficient data or if the sketch sizes are too small,
+   * this will return false.
+   * @param sketch1 Input DoubleSketch 1
+   * @param sketch2 Input DoubleSketch 2
+   * @param tgtPvalue Target p-value. Typically .001 to .1, e.g., .05.
+   * @return Boolean indicating whether we can reject the null hypothesis (that the sketches
+   * reflect the same underlying distribution) using the provided tgtPValue.
+   */
+  public static boolean kolmogorovSmirnovTest(final DoublesSketch sketch1,
+      final DoublesSketch sketch2, final double tgtPvalue) {
+    final double delta = computeKSDelta(sketch1, sketch2);
+    final double thresh = computeKSThreshold(sketch1, sketch2, tgtPvalue);
+    return delta > thresh;
+  }
+
+}
diff --git a/src/main/java/org/apache/datasketches/quantiles/Util.java b/src/main/java/org/apache/datasketches/quantiles/Util.java
index c60bc21..a61a2c2 100644
--- a/src/main/java/org/apache/datasketches/quantiles/Util.java
+++ b/src/main/java/org/apache/datasketches/quantiles/Util.java
@@ -73,52 +73,9 @@
    * @param sketch2 Input DoubleSketch 2
    * @return the raw delta area between two quantile sketches
    */
-  public static double computeKSDelta(final DoublesSketch sketch1,
-      final DoublesSketch sketch2) {
-    final DoublesAuxiliary p = new DoublesAuxiliary(sketch1);
-    final DoublesAuxiliary q = new DoublesAuxiliary(sketch2);
-
-    final double[] pSamplesArr = p.auxSamplesArr_;
-    final double[] qSamplesArr = q.auxSamplesArr_;
-    final long[] pCumWtsArr = p.auxCumWtsArr_;
-    final long[] qCumWtsArr = q.auxCumWtsArr_;
-    final int pSamplesArrLen = pSamplesArr.length;
-    final int qSamplesArrLen = qSamplesArr.length;
-
-    final double n1 = sketch1.getN();
-    final double n2 = sketch2.getN();
-
-    //Compute D from the two distributions
-    double deltaArea = 0.0;
-    int i = getNextIndex(pSamplesArr, -1);
-    int j = getNextIndex(qSamplesArr, -1);
-
-    // We're done if either array reaches the end
-    while ((i < pSamplesArrLen) && (j < qSamplesArrLen)) {
-      final double pSample = pSamplesArr[i];
-      final double qSample = qSamplesArr[j];
-      final long pWt = pCumWtsArr[i];
-      final long qWt = qCumWtsArr[j];
-      final double pNormWt = pWt / n1;
-      final double qNormWt = qWt / n2;
-      final double pMinusQ = Math.abs(pNormWt - qNormWt);
-      final double curD = deltaArea;
-      deltaArea = Math.max(curD, pMinusQ);
-
-      //Increment i or j or both
-      if (pSample == qSample) {
-        i = getNextIndex(pSamplesArr, i);
-        j = getNextIndex(qSamplesArr, j);
-      } else if (pSample < qSample) {
-        i = getNextIndex(pSamplesArr, i);
-      } else {
-        j = getNextIndex(qSamplesArr, j);
-      }
-    }
-
-    //This is D, the delta difference in area of the two distributions
-    deltaArea = Math.max(deltaArea, Math.abs((pCumWtsArr[i] / n1) - (qCumWtsArr[j] / n2)));
-    return deltaArea;
+  @Deprecated
+  public static double computeKSDelta(final DoublesSketch sketch1, final DoublesSketch sketch2) {
+    return KolmogorovSmirnov.computeKSDelta(sketch1, sketch2);
   }
 
   /**
@@ -132,19 +89,11 @@
    * @param tgtPvalue Target p-value. Typically .001 to .1, e.g., .05.
    * @return the adjusted threshold to be compared with the raw delta area.
    */
+  @Deprecated
   public static double computeKSThreshold(final DoublesSketch sketch1,
                                            final DoublesSketch sketch2,
                                            final double tgtPvalue) {
-    final double r1 = sketch1.getRetainedItems();
-    final double r2 = sketch2.getRetainedItems();
-    final double alpha = tgtPvalue;
-    final double alphaFactor = Math.sqrt(-0.5 * Math.log(0.5 * alpha));
-    final double deltaAreaThreshold = alphaFactor * Math.sqrt((r1 + r2) / (r1 * r2));
-    final double eps1 = Util.getNormalizedRankError(sketch1.getK(), false);
-    final double eps2 = Util.getNormalizedRankError(sketch2.getK(), false);
-
-    final double adjDeltaAreaThreshold = deltaAreaThreshold + eps1 + eps2;
-    return adjDeltaAreaThreshold;
+	  return KolmogorovSmirnov.computeKSThreshold(sketch1, sketch2, tgtPvalue);
   }
 
   /**
@@ -157,32 +106,15 @@
    * @return Boolean indicating whether we can reject the null hypothesis (that the sketches
    * reflect the same underlying distribution) using the provided tgtPValue.
    */
+  @Deprecated
   public static boolean kolmogorovSmirnovTest(final DoublesSketch sketch1,
       final DoublesSketch sketch2, final double tgtPvalue) {
-    final double delta = computeKSDelta(sketch1, sketch2);
-    final double thresh = computeKSThreshold(sketch1, sketch2, tgtPvalue);
-    return delta > thresh;
-  }
-
-  private static final int getNextIndex(final double[] samplesArr, final int stIdx) {
-    int idx = stIdx + 1;
-    final int samplesArrLen = samplesArr.length;
-
-    if (idx >= samplesArrLen) { return samplesArrLen; }
-
-    // if we have a sequence of equal values, use the last one of the sequence
-    final double val = samplesArr[idx];
-    int nxtIdx = idx + 1;
-    while ((nxtIdx < samplesArrLen) && (samplesArr[nxtIdx] == val)) {
-      idx = nxtIdx;
-      ++nxtIdx;
-    }
-    return idx;
+	  return KolmogorovSmirnov.kolmogorovSmirnovTest(sketch1, sketch2, tgtPvalue);
   }
 
   /**
    * Gets the normalized rank error given k and pmf for the Quantiles DoubleSketch and ItemsSketch.
-   * @param k the configuation parameter
+   * @param k the configuration parameter
    * @param pmf if true, returns the "double-sided" normalized rank error for the getPMF() function.
    * Otherwise, it is the "single-sided" normalized rank error for all the other queries.
    * @return if pmf is true, the normalized rank error for the getPMF() function.
diff --git a/src/test/java/org/apache/datasketches/quantiles/KolmogorovSmirnovTest.java b/src/test/java/org/apache/datasketches/quantiles/KolmogorovSmirnovTest.java
new file mode 100644
index 0000000..e69d688
--- /dev/null
+++ b/src/test/java/org/apache/datasketches/quantiles/KolmogorovSmirnovTest.java
@@ -0,0 +1,134 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.datasketches.quantiles;
+
+import static org.testng.Assert.assertEquals;
+import static org.testng.Assert.assertFalse;
+import static org.testng.Assert.assertTrue;
+
+import java.util.Random;
+
+import org.testng.annotations.Test;
+
+import org.apache.datasketches.SketchesArgumentException;
+
+@SuppressWarnings("javadoc")
+public class KolmogorovSmirnovTest {
+
+ @Test
+ public void checkKomologorovSmirnovStatistic1() {
+   final int k = 256;
+   final UpdateDoublesSketch s1 = DoublesSketch.builder().setK(k).build();
+   final UpdateDoublesSketch s2 = DoublesSketch.builder().setK(k).build();
+
+   final Random rand = new Random();
+
+   final int n =  (3 * k) - 1;
+   for (int i = 0; i < n; ++i) {
+     final double x = rand.nextGaussian();
+     s1.update(x + 100);
+     s2.update(x);
+   }
+
+   assertEquals(KolmogorovSmirnov.computeKSDelta(s1, s2), 1.0, 1E-6);
+   println("D = " + KolmogorovSmirnov.computeKSDelta(s1, s2));
+ }
+
+ @Test
+ public void checkKomologorovSmirnovStatistic2() {
+   final int k = 256;
+   final UpdateDoublesSketch s1 = DoublesSketch.builder().setK(k).build();
+   final UpdateDoublesSketch s2 = DoublesSketch.builder().setK(k).build();
+
+   final Random rand = new Random();
+
+   final int n =  (3 * k) - 1;
+   for (int i = 0; i < n; ++i) {
+     final double x = rand.nextGaussian();
+     s1.update(x);
+     s2.update(x);
+   }
+
+   assertEquals(KolmogorovSmirnov.computeKSDelta(s1, s2), 0, .01);
+   println("D = " + KolmogorovSmirnov.computeKSDelta(s1, s2));
+ }
+
+ @Test
+ public void checkKomologorovSmirnovStatistic3() {
+   final int k = 2048;
+   final UpdateDoublesSketch s1 = DoublesSketch.builder().setK(k).build();
+   final UpdateDoublesSketch s2 = DoublesSketch.builder().setK(k).build();
+   final double tgtPvalue = .05;
+
+   final Random rand = new Random();
+
+   final int n =  (3 * k) - 1;
+   for (int i = 0; i < n; ++i) {
+     final double x = rand.nextGaussian();
+     s1.update(x + .05);
+     s2.update(x);
+   }
+
+   double D = KolmogorovSmirnov.computeKSDelta(s1, s2);
+   double thresh = KolmogorovSmirnov.computeKSThreshold(s1, s2, tgtPvalue);
+   final boolean reject = KolmogorovSmirnov.kolmogorovSmirnovTest(s1, s2, tgtPvalue);
+   println("pVal = " + tgtPvalue + "\nK = " + k + "\nD = " + D + "\nTh = " + thresh
+       + "\nNull Hypoth Rejected = " + reject);
+   assertFalse(reject);
+ }
+
+ @Test
+ public void checkKomologorovSmirnovStatistic4() {
+   final int k = 8192;
+   final UpdateDoublesSketch s1 = DoublesSketch.builder().setK(k).build();
+   final UpdateDoublesSketch s2 = DoublesSketch.builder().setK(k).build();
+   final double tgtPvalue = .05;
+
+   final Random rand = new Random();
+
+   final int n =  (3 * k) - 1;
+   for (int i = 0; i < n; ++i) {
+     final double x = rand.nextGaussian();
+     s1.update(x + .05);
+     s2.update(x);
+   }
+
+   double D = KolmogorovSmirnov.computeKSDelta(s1, s2);
+   double thresh = KolmogorovSmirnov.computeKSThreshold(s1, s2, tgtPvalue);
+   final boolean reject = KolmogorovSmirnov.kolmogorovSmirnovTest(s1, s2, tgtPvalue);
+   println("pVal = " + tgtPvalue + "\nK = " + k + "\nD = " + D + "\nTh = " + thresh
+       + "\nNull Hypoth Rejected = " + reject);
+   assertTrue(reject);
+ }
+
+
+  @Test
+  public void printlnTest() {
+    println("PRINTING: "+this.getClass().getName());
+  }
+
+  /**
+   * @param s value to print
+   */
+  static void println(String s) {
+    //System.out.println(s); //disable here
+  }
+
+}
diff --git a/src/test/java/org/apache/datasketches/quantiles/UtilTest.java b/src/test/java/org/apache/datasketches/quantiles/UtilTest.java
index ce50160..e01ceab 100644
--- a/src/test/java/org/apache/datasketches/quantiles/UtilTest.java
+++ b/src/test/java/org/apache/datasketches/quantiles/UtilTest.java
@@ -20,11 +20,9 @@
 package org.apache.datasketches.quantiles;
 
 import static org.testng.Assert.assertEquals;
-import static org.testng.Assert.assertFalse;
 import static org.testng.Assert.assertTrue;
 
 import java.util.Arrays;
-import java.util.Random;
 
 import org.testng.annotations.Test;
 
@@ -309,93 +307,6 @@
 //    exhaustiveMain(new String[] {"10", "100"});
 //  }
 
- @Test
- public void checkKomologorovSmirnovStatistic1() {
-   final int k = 256;
-   final UpdateDoublesSketch s1 = DoublesSketch.builder().setK(k).build();
-   final UpdateDoublesSketch s2 = DoublesSketch.builder().setK(k).build();
-
-   final Random rand = new Random();
-
-   final int n =  (3 * k) - 1;
-   for (int i = 0; i < n; ++i) {
-     final double x = rand.nextGaussian();
-     s1.update(x + 100);
-     s2.update(x);
-   }
-
-   assertEquals(Util.computeKSDelta(s1, s2), 1.0, 1.0 + 1E-6);
-   //println("D = " + Util.computeKSDelta(s1, s2));
- }
-
- @Test
- public void checkKomologorovSmirnovStatistic2() {
-   final int k = 256;
-   final UpdateDoublesSketch s1 = DoublesSketch.builder().setK(k).build();
-   final UpdateDoublesSketch s2 = DoublesSketch.builder().setK(k).build();
-
-   final Random rand = new Random();
-
-   final int n =  (3 * k) - 1;
-   for (int i = 0; i < n; ++i) {
-     final double x = rand.nextGaussian();
-     s1.update(x);
-     s2.update(x);
-   }
-
-   assertEquals(Util.computeKSDelta(s1, s2), 0, .01);
-   //println("D = " + Util.computeKSDelta(s1, s2));
- }
-
- @Test
- public void checkKomologorovSmirnovStatistic3() {
-   final int k = 2048;
-   final UpdateDoublesSketch s1 = DoublesSketch.builder().setK(k).build();
-   final UpdateDoublesSketch s2 = DoublesSketch.builder().setK(k).build();
-   final double tgtPvalue = .05;
-
-   final Random rand = new Random();
-
-   final int n =  (3 * k) - 1;
-   for (int i = 0; i < n; ++i) {
-     final double x = rand.nextGaussian();
-     s1.update(x + .05);
-     s2.update(x);
-   }
-
-   double D = Util.computeKSDelta(s1, s2);
-   double thresh = Util.computeKSThreshold(s1, s2, tgtPvalue);
-   final boolean reject = Util.kolmogorovSmirnovTest(s1, s2, tgtPvalue);
-   println("pVal = " + tgtPvalue + "\nK = " + k + "\nD = " + D + "\nTh = " + thresh
-       + "\nNull Hypoth Rejected = " + reject);
-   assertFalse(reject);
- }
-
- @Test
- public void checkKomologorovSmirnovStatistic4() {
-   final int k = 8192;
-   final UpdateDoublesSketch s1 = DoublesSketch.builder().setK(k).build();
-   final UpdateDoublesSketch s2 = DoublesSketch.builder().setK(k).build();
-   final double tgtPvalue = .05;
-
-   final Random rand = new Random();
-
-   final int n =  (3 * k) - 1;
-   for (int i = 0; i < n; ++i) {
-     final double x = rand.nextGaussian();
-     s1.update(x + .05);
-     s2.update(x);
-   }
-
-   double D = Util.computeKSDelta(s1, s2);
-   double thresh = Util.computeKSThreshold(s1, s2, tgtPvalue);
-   final boolean reject = Util.kolmogorovSmirnovTest(s1, s2, tgtPvalue);
-   println("pVal = " + tgtPvalue + "\nK = " + k + "\nD = " + D + "\nTh = " + thresh
-       + "\nNull Hypoth Rejected = " + reject);
-   assertTrue(reject);
- }
-
-
   @Test
   public void printlnTest() {
     println("PRINTING: "+this.getClass().getName());