move KS test from Util to its own class
diff --git a/src/main/java/org/apache/datasketches/quantiles/KolmogorovSmirnov.java b/src/main/java/org/apache/datasketches/quantiles/KolmogorovSmirnov.java
new file mode 100644
index 0000000..848272f
--- /dev/null
+++ b/src/main/java/org/apache/datasketches/quantiles/KolmogorovSmirnov.java
@@ -0,0 +1,111 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.datasketches.quantiles;
+
+/**
+ * Kolmogorov-Smirnov Test
+ * See <a href="https://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test">Kolmogorov–Smirnov Test</a>
+ */
+final class KolmogorovSmirnov {
+
+ /**
+ * Computes the raw delta area between two quantile sketches for the
+ * {@link #kolmogorovSmirnovTest(DoublesSketch, DoublesSketch, double)
+ * Kolmogorov-Smirnov Test}
+ * method.
+ * @param sketch1 Input DoubleSketch 1
+ * @param sketch2 Input DoubleSketch 2
+ * @return the raw delta area between two quantile sketches
+ */
+ public static double computeKSDelta(final DoublesSketch sketch1, final DoublesSketch sketch2) {
+ final DoublesAuxiliary p = new DoublesAuxiliary(sketch1);
+ final DoublesAuxiliary q = new DoublesAuxiliary(sketch2);
+
+ final double[] pSamplesArr = p.auxSamplesArr_;
+ final double[] qSamplesArr = q.auxSamplesArr_;
+ final long[] pCumWtsArr = p.auxCumWtsArr_;
+ final long[] qCumWtsArr = q.auxCumWtsArr_;
+ final int pSamplesArrLen = pSamplesArr.length;
+ final int qSamplesArrLen = qSamplesArr.length;
+
+ final double n1 = sketch1.getN();
+ final double n2 = sketch2.getN();
+
+ double deltaArea = 0;
+ int i = 0;
+ int j = 0;
+
+ while ((i < pSamplesArrLen) && (j < qSamplesArrLen)) {
+ deltaArea = Math.max(deltaArea, Math.abs(pCumWtsArr[i] / n1 - qCumWtsArr[j] / n2));
+ if (pSamplesArr[i] < qSamplesArr[j]) {
+ i++;
+ } else if (qSamplesArr[j] < pSamplesArr[i]) {
+ j++;
+ } else {
+ i++;
+ j++;
+ }
+ }
+
+ deltaArea = Math.max(deltaArea, Math.abs(pCumWtsArr[i] / n1 - qCumWtsArr[j] / n2));
+ return deltaArea;
+ }
+
+ /**
+ * Computes the adjusted delta area threshold for the
+ * {@link #kolmogorovSmirnovTest(DoublesSketch, DoublesSketch, double) Kolmogorov-Smirnov Test}
+ * method.
+ * This adjusts the computed threshold by the error epsilons of the two given sketches.
+ * @param sketch1 Input DoubleSketch 1
+ * @param sketch2 Input DoubleSketch 2
+ * @param tgtPvalue Target p-value. Typically .001 to .1, e.g., .05.
+ * @return the adjusted threshold to be compared with the raw delta area.
+ */
+ public static double computeKSThreshold(final DoublesSketch sketch1,
+ final DoublesSketch sketch2,
+ final double tgtPvalue) {
+ final double r1 = sketch1.getRetainedItems();
+ final double r2 = sketch2.getRetainedItems();
+ final double alpha = tgtPvalue;
+ final double alphaFactor = Math.sqrt(-0.5 * Math.log(0.5 * alpha));
+ final double deltaAreaThreshold = alphaFactor * Math.sqrt((r1 + r2) / (r1 * r2));
+ final double eps1 = sketch1.getNormalizedRankError(false);
+ final double eps2 = sketch2.getNormalizedRankError(false);
+ return deltaAreaThreshold + eps1 + eps2;
+ }
+
+ /**
+ * Performs the Kolmogorov-Smirnov Test between two quantiles sketches.
+ * Note: if the given sketches have insufficient data or if the sketch sizes are too small,
+ * this will return false.
+ * @param sketch1 Input DoubleSketch 1
+ * @param sketch2 Input DoubleSketch 2
+ * @param tgtPvalue Target p-value. Typically .001 to .1, e.g., .05.
+ * @return Boolean indicating whether we can reject the null hypothesis (that the sketches
+ * reflect the same underlying distribution) using the provided tgtPValue.
+ */
+ public static boolean kolmogorovSmirnovTest(final DoublesSketch sketch1,
+ final DoublesSketch sketch2, final double tgtPvalue) {
+ final double delta = computeKSDelta(sketch1, sketch2);
+ final double thresh = computeKSThreshold(sketch1, sketch2, tgtPvalue);
+ return delta > thresh;
+ }
+
+}
diff --git a/src/main/java/org/apache/datasketches/quantiles/Util.java b/src/main/java/org/apache/datasketches/quantiles/Util.java
index c60bc21..a61a2c2 100644
--- a/src/main/java/org/apache/datasketches/quantiles/Util.java
+++ b/src/main/java/org/apache/datasketches/quantiles/Util.java
@@ -73,52 +73,9 @@
* @param sketch2 Input DoubleSketch 2
* @return the raw delta area between two quantile sketches
*/
- public static double computeKSDelta(final DoublesSketch sketch1,
- final DoublesSketch sketch2) {
- final DoublesAuxiliary p = new DoublesAuxiliary(sketch1);
- final DoublesAuxiliary q = new DoublesAuxiliary(sketch2);
-
- final double[] pSamplesArr = p.auxSamplesArr_;
- final double[] qSamplesArr = q.auxSamplesArr_;
- final long[] pCumWtsArr = p.auxCumWtsArr_;
- final long[] qCumWtsArr = q.auxCumWtsArr_;
- final int pSamplesArrLen = pSamplesArr.length;
- final int qSamplesArrLen = qSamplesArr.length;
-
- final double n1 = sketch1.getN();
- final double n2 = sketch2.getN();
-
- //Compute D from the two distributions
- double deltaArea = 0.0;
- int i = getNextIndex(pSamplesArr, -1);
- int j = getNextIndex(qSamplesArr, -1);
-
- // We're done if either array reaches the end
- while ((i < pSamplesArrLen) && (j < qSamplesArrLen)) {
- final double pSample = pSamplesArr[i];
- final double qSample = qSamplesArr[j];
- final long pWt = pCumWtsArr[i];
- final long qWt = qCumWtsArr[j];
- final double pNormWt = pWt / n1;
- final double qNormWt = qWt / n2;
- final double pMinusQ = Math.abs(pNormWt - qNormWt);
- final double curD = deltaArea;
- deltaArea = Math.max(curD, pMinusQ);
-
- //Increment i or j or both
- if (pSample == qSample) {
- i = getNextIndex(pSamplesArr, i);
- j = getNextIndex(qSamplesArr, j);
- } else if (pSample < qSample) {
- i = getNextIndex(pSamplesArr, i);
- } else {
- j = getNextIndex(qSamplesArr, j);
- }
- }
-
- //This is D, the delta difference in area of the two distributions
- deltaArea = Math.max(deltaArea, Math.abs((pCumWtsArr[i] / n1) - (qCumWtsArr[j] / n2)));
- return deltaArea;
+ @Deprecated
+ public static double computeKSDelta(final DoublesSketch sketch1, final DoublesSketch sketch2) {
+ return KolmogorovSmirnov.computeKSDelta(sketch1, sketch2);
}
/**
@@ -132,19 +89,11 @@
* @param tgtPvalue Target p-value. Typically .001 to .1, e.g., .05.
* @return the adjusted threshold to be compared with the raw delta area.
*/
+ @Deprecated
public static double computeKSThreshold(final DoublesSketch sketch1,
final DoublesSketch sketch2,
final double tgtPvalue) {
- final double r1 = sketch1.getRetainedItems();
- final double r2 = sketch2.getRetainedItems();
- final double alpha = tgtPvalue;
- final double alphaFactor = Math.sqrt(-0.5 * Math.log(0.5 * alpha));
- final double deltaAreaThreshold = alphaFactor * Math.sqrt((r1 + r2) / (r1 * r2));
- final double eps1 = Util.getNormalizedRankError(sketch1.getK(), false);
- final double eps2 = Util.getNormalizedRankError(sketch2.getK(), false);
-
- final double adjDeltaAreaThreshold = deltaAreaThreshold + eps1 + eps2;
- return adjDeltaAreaThreshold;
+ return KolmogorovSmirnov.computeKSThreshold(sketch1, sketch2, tgtPvalue);
}
/**
@@ -157,32 +106,15 @@
* @return Boolean indicating whether we can reject the null hypothesis (that the sketches
* reflect the same underlying distribution) using the provided tgtPValue.
*/
+ @Deprecated
public static boolean kolmogorovSmirnovTest(final DoublesSketch sketch1,
final DoublesSketch sketch2, final double tgtPvalue) {
- final double delta = computeKSDelta(sketch1, sketch2);
- final double thresh = computeKSThreshold(sketch1, sketch2, tgtPvalue);
- return delta > thresh;
- }
-
- private static final int getNextIndex(final double[] samplesArr, final int stIdx) {
- int idx = stIdx + 1;
- final int samplesArrLen = samplesArr.length;
-
- if (idx >= samplesArrLen) { return samplesArrLen; }
-
- // if we have a sequence of equal values, use the last one of the sequence
- final double val = samplesArr[idx];
- int nxtIdx = idx + 1;
- while ((nxtIdx < samplesArrLen) && (samplesArr[nxtIdx] == val)) {
- idx = nxtIdx;
- ++nxtIdx;
- }
- return idx;
+ return KolmogorovSmirnov.kolmogorovSmirnovTest(sketch1, sketch2, tgtPvalue);
}
/**
* Gets the normalized rank error given k and pmf for the Quantiles DoubleSketch and ItemsSketch.
- * @param k the configuation parameter
+ * @param k the configuration parameter
* @param pmf if true, returns the "double-sided" normalized rank error for the getPMF() function.
* Otherwise, it is the "single-sided" normalized rank error for all the other queries.
* @return if pmf is true, the normalized rank error for the getPMF() function.
diff --git a/src/test/java/org/apache/datasketches/quantiles/KolmogorovSmirnovTest.java b/src/test/java/org/apache/datasketches/quantiles/KolmogorovSmirnovTest.java
new file mode 100644
index 0000000..e69d688
--- /dev/null
+++ b/src/test/java/org/apache/datasketches/quantiles/KolmogorovSmirnovTest.java
@@ -0,0 +1,134 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.datasketches.quantiles;
+
+import static org.testng.Assert.assertEquals;
+import static org.testng.Assert.assertFalse;
+import static org.testng.Assert.assertTrue;
+
+import java.util.Random;
+
+import org.testng.annotations.Test;
+
+import org.apache.datasketches.SketchesArgumentException;
+
+@SuppressWarnings("javadoc")
+public class KolmogorovSmirnovTest {
+
+ @Test
+ public void checkKomologorovSmirnovStatistic1() {
+ final int k = 256;
+ final UpdateDoublesSketch s1 = DoublesSketch.builder().setK(k).build();
+ final UpdateDoublesSketch s2 = DoublesSketch.builder().setK(k).build();
+
+ final Random rand = new Random();
+
+ final int n = (3 * k) - 1;
+ for (int i = 0; i < n; ++i) {
+ final double x = rand.nextGaussian();
+ s1.update(x + 100);
+ s2.update(x);
+ }
+
+ assertEquals(KolmogorovSmirnov.computeKSDelta(s1, s2), 1.0, 1E-6);
+ println("D = " + KolmogorovSmirnov.computeKSDelta(s1, s2));
+ }
+
+ @Test
+ public void checkKomologorovSmirnovStatistic2() {
+ final int k = 256;
+ final UpdateDoublesSketch s1 = DoublesSketch.builder().setK(k).build();
+ final UpdateDoublesSketch s2 = DoublesSketch.builder().setK(k).build();
+
+ final Random rand = new Random();
+
+ final int n = (3 * k) - 1;
+ for (int i = 0; i < n; ++i) {
+ final double x = rand.nextGaussian();
+ s1.update(x);
+ s2.update(x);
+ }
+
+ assertEquals(KolmogorovSmirnov.computeKSDelta(s1, s2), 0, .01);
+ println("D = " + KolmogorovSmirnov.computeKSDelta(s1, s2));
+ }
+
+ @Test
+ public void checkKomologorovSmirnovStatistic3() {
+ final int k = 2048;
+ final UpdateDoublesSketch s1 = DoublesSketch.builder().setK(k).build();
+ final UpdateDoublesSketch s2 = DoublesSketch.builder().setK(k).build();
+ final double tgtPvalue = .05;
+
+ final Random rand = new Random();
+
+ final int n = (3 * k) - 1;
+ for (int i = 0; i < n; ++i) {
+ final double x = rand.nextGaussian();
+ s1.update(x + .05);
+ s2.update(x);
+ }
+
+ double D = KolmogorovSmirnov.computeKSDelta(s1, s2);
+ double thresh = KolmogorovSmirnov.computeKSThreshold(s1, s2, tgtPvalue);
+ final boolean reject = KolmogorovSmirnov.kolmogorovSmirnovTest(s1, s2, tgtPvalue);
+ println("pVal = " + tgtPvalue + "\nK = " + k + "\nD = " + D + "\nTh = " + thresh
+ + "\nNull Hypoth Rejected = " + reject);
+ assertFalse(reject);
+ }
+
+ @Test
+ public void checkKomologorovSmirnovStatistic4() {
+ final int k = 8192;
+ final UpdateDoublesSketch s1 = DoublesSketch.builder().setK(k).build();
+ final UpdateDoublesSketch s2 = DoublesSketch.builder().setK(k).build();
+ final double tgtPvalue = .05;
+
+ final Random rand = new Random();
+
+ final int n = (3 * k) - 1;
+ for (int i = 0; i < n; ++i) {
+ final double x = rand.nextGaussian();
+ s1.update(x + .05);
+ s2.update(x);
+ }
+
+ double D = KolmogorovSmirnov.computeKSDelta(s1, s2);
+ double thresh = KolmogorovSmirnov.computeKSThreshold(s1, s2, tgtPvalue);
+ final boolean reject = KolmogorovSmirnov.kolmogorovSmirnovTest(s1, s2, tgtPvalue);
+ println("pVal = " + tgtPvalue + "\nK = " + k + "\nD = " + D + "\nTh = " + thresh
+ + "\nNull Hypoth Rejected = " + reject);
+ assertTrue(reject);
+ }
+
+
+ @Test
+ public void printlnTest() {
+ println("PRINTING: "+this.getClass().getName());
+ }
+
+ /**
+ * @param s value to print
+ */
+ static void println(String s) {
+ //System.out.println(s); //disable here
+ }
+
+}
diff --git a/src/test/java/org/apache/datasketches/quantiles/UtilTest.java b/src/test/java/org/apache/datasketches/quantiles/UtilTest.java
index ce50160..e01ceab 100644
--- a/src/test/java/org/apache/datasketches/quantiles/UtilTest.java
+++ b/src/test/java/org/apache/datasketches/quantiles/UtilTest.java
@@ -20,11 +20,9 @@
package org.apache.datasketches.quantiles;
import static org.testng.Assert.assertEquals;
-import static org.testng.Assert.assertFalse;
import static org.testng.Assert.assertTrue;
import java.util.Arrays;
-import java.util.Random;
import org.testng.annotations.Test;
@@ -309,93 +307,6 @@
// exhaustiveMain(new String[] {"10", "100"});
// }
- @Test
- public void checkKomologorovSmirnovStatistic1() {
- final int k = 256;
- final UpdateDoublesSketch s1 = DoublesSketch.builder().setK(k).build();
- final UpdateDoublesSketch s2 = DoublesSketch.builder().setK(k).build();
-
- final Random rand = new Random();
-
- final int n = (3 * k) - 1;
- for (int i = 0; i < n; ++i) {
- final double x = rand.nextGaussian();
- s1.update(x + 100);
- s2.update(x);
- }
-
- assertEquals(Util.computeKSDelta(s1, s2), 1.0, 1.0 + 1E-6);
- //println("D = " + Util.computeKSDelta(s1, s2));
- }
-
- @Test
- public void checkKomologorovSmirnovStatistic2() {
- final int k = 256;
- final UpdateDoublesSketch s1 = DoublesSketch.builder().setK(k).build();
- final UpdateDoublesSketch s2 = DoublesSketch.builder().setK(k).build();
-
- final Random rand = new Random();
-
- final int n = (3 * k) - 1;
- for (int i = 0; i < n; ++i) {
- final double x = rand.nextGaussian();
- s1.update(x);
- s2.update(x);
- }
-
- assertEquals(Util.computeKSDelta(s1, s2), 0, .01);
- //println("D = " + Util.computeKSDelta(s1, s2));
- }
-
- @Test
- public void checkKomologorovSmirnovStatistic3() {
- final int k = 2048;
- final UpdateDoublesSketch s1 = DoublesSketch.builder().setK(k).build();
- final UpdateDoublesSketch s2 = DoublesSketch.builder().setK(k).build();
- final double tgtPvalue = .05;
-
- final Random rand = new Random();
-
- final int n = (3 * k) - 1;
- for (int i = 0; i < n; ++i) {
- final double x = rand.nextGaussian();
- s1.update(x + .05);
- s2.update(x);
- }
-
- double D = Util.computeKSDelta(s1, s2);
- double thresh = Util.computeKSThreshold(s1, s2, tgtPvalue);
- final boolean reject = Util.kolmogorovSmirnovTest(s1, s2, tgtPvalue);
- println("pVal = " + tgtPvalue + "\nK = " + k + "\nD = " + D + "\nTh = " + thresh
- + "\nNull Hypoth Rejected = " + reject);
- assertFalse(reject);
- }
-
- @Test
- public void checkKomologorovSmirnovStatistic4() {
- final int k = 8192;
- final UpdateDoublesSketch s1 = DoublesSketch.builder().setK(k).build();
- final UpdateDoublesSketch s2 = DoublesSketch.builder().setK(k).build();
- final double tgtPvalue = .05;
-
- final Random rand = new Random();
-
- final int n = (3 * k) - 1;
- for (int i = 0; i < n; ++i) {
- final double x = rand.nextGaussian();
- s1.update(x + .05);
- s2.update(x);
- }
-
- double D = Util.computeKSDelta(s1, s2);
- double thresh = Util.computeKSThreshold(s1, s2, tgtPvalue);
- final boolean reject = Util.kolmogorovSmirnovTest(s1, s2, tgtPvalue);
- println("pVal = " + tgtPvalue + "\nK = " + k + "\nD = " + D + "\nTh = " + thresh
- + "\nNull Hypoth Rejected = " + reject);
- assertTrue(reject);
- }
-
-
@Test
public void printlnTest() {
println("PRINTING: "+this.getClass().getName());