| /* |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| */ |
| |
| package org.apache.datasketches; |
| |
| /** |
| * This class enables the estimation of error bounds given a sample set size, the sampling |
| * probability theta, the number of standard deviations and a simple noDataSeen flag. This can |
| * be used to estimate error bounds for fixed threshold sampling as well as the error bounds |
| * calculations for sketches. |
| * |
| * @author Kevin Lang |
| */ |
| // BTW, the suffixes "NStar", "NPrimeB", and "NPrimeF" correspond to variables in the formal |
| // writeup of this scheme. |
| public final class BinomialBoundsN { |
| |
| private BinomialBoundsN() {} |
| |
| private static double[] deltaOfNumSDev = |
| { |
| 0.5000000000000000000, // not actually using this value |
| 0.1586553191586026479, |
| 0.0227502618904135701, |
| 0.0013498126861731796 |
| }; |
| |
| // our "classic" bounds, but now with continuity correction |
| |
| private static double contClassicLB(final double numSamplesF, final double theta, |
| final double numSDev) { |
| final double nHat = (numSamplesF - 0.5) / theta; |
| final double b = numSDev * Math.sqrt((1.0 - theta) / theta); |
| final double d = 0.5 * b * Math.sqrt((b * b) + (4.0 * nHat)); |
| final double center = nHat + (0.5 * (b * b)); |
| return (center - d); |
| } |
| |
| private static double contClassicUB(final double numSamplesF, final double theta, |
| final double numSDev) { |
| final double nHat = (numSamplesF + 0.5) / theta; |
| final double b = numSDev * Math.sqrt((1.0 - theta) / theta); |
| final double d = 0.5 * b * Math.sqrt((b * b) + (4.0 * nHat)); |
| final double center = nHat + (0.5 * (b * b)); |
| return (center + d); |
| } |
| |
| // This is a special purpose calculator for NStar, using a computational |
| // strategy inspired by its Bayesian definition. It is only appropriate |
| // for a very limited set of inputs. However, the procedure computeApproxBinoLB () |
| // below does in fact only call it for suitably limited inputs. |
| // Outside of this limited range, two different bad things will happen. |
| // First, because we are not using logarithms, the values of intermediate |
| // quantities will exceed the dynamic range of doubles. Second, even if that |
| // problem were fixed, the running time of this procedure is essentially linear |
| // in est = (numSamples / p), and that can be Very, Very Big. |
| |
| private static long specialNStar(final long numSamplesI, final double p, final double delta) { |
| final double q, numSamplesF; |
| double tot, curTerm; |
| long m; |
| assertTrue(numSamplesI >= 1); |
| assertTrue((0.0 < p) && (p < 1.0)); |
| assertTrue((0.0 < delta) && (delta < 1.0)); |
| q = 1.0 - p; |
| numSamplesF = numSamplesI; |
| // Use a different algorithm if the following isn't true; this one will be too slow, or worse. |
| assertTrue((numSamplesF / p) < 500.0); |
| curTerm = Math.pow(p, numSamplesF); // curTerm = posteriorProbability (k, k, p) |
| assertTrue(curTerm > 1e-100); // sanity check for non-use of logarithms |
| tot = curTerm; |
| m = numSamplesI; |
| while (tot <= delta) { // this test can fail even the first time |
| curTerm = (curTerm * q * (m)) / ((m + 1) - numSamplesI); |
| tot += curTerm; |
| m += 1; |
| } |
| // we have reached a state where tot > delta, so back up one |
| return (m - 1); |
| } |
| |
| // The following procedure has very limited applicability. |
| // The above remarks about specialNStar() also apply here. |
| private static long specialNPrimeB(final long numSamplesI, final double p, final double delta) { |
| final double q, numSamplesF, oneMinusDelta; |
| double tot, curTerm; |
| long m; |
| assertTrue(numSamplesI >= 1); |
| assertTrue((0.0 < p) && (p < 1.0)); |
| assertTrue((0.0 < delta) && (delta < 1.0)); |
| q = 1.0 - p; |
| oneMinusDelta = 1.0 - delta; |
| numSamplesF = numSamplesI; |
| curTerm = Math.pow(p, numSamplesF); // curTerm = posteriorProbability (k, k, p) |
| assertTrue(curTerm > 1e-100); // sanity check for non-use of logarithms |
| tot = curTerm; |
| m = numSamplesI; |
| while (tot < oneMinusDelta) { |
| curTerm = (curTerm * q * (m)) / ((m + 1) - numSamplesI); |
| tot += curTerm; |
| m += 1; |
| } |
| return (m); // don't need to back up |
| } |
| |
| private static long specialNPrimeF(final long numSamplesI, final double p, final double delta) { |
| // Use a different algorithm if the following isn't true; this one will be too slow, or worse. |
| assertTrue(((numSamplesI) / p) < 500.0); //A super-small delta could also make it slow. |
| return (specialNPrimeB(numSamplesI + 1, p, delta)); |
| } |
| |
| // The following computes an approximation to the lower bound of |
| // a Frequentist confidence interval based on the tails of the Binomial distribution. |
| private static double computeApproxBinoLB(final long numSamplesI, final double theta, |
| final int numSDev) { |
| if (theta == 1.0) { |
| return (numSamplesI); |
| } |
| |
| else if (numSamplesI == 0) { |
| return (0.0); |
| } |
| |
| else if (numSamplesI == 1) { |
| final double delta = deltaOfNumSDev[numSDev]; |
| final double rawLB = (Math.log(1.0 - delta)) / (Math.log(1.0 - theta)); |
| return (Math.floor(rawLB)); // round down |
| } |
| |
| else if (numSamplesI > 120) { |
| // plenty of samples, so gaussian approximation to binomial distribution isn't too bad |
| final double rawLB = contClassicLB( numSamplesI, theta, numSDev); |
| return (rawLB - 0.5); // fake round down |
| } |
| |
| // at this point we know 2 <= numSamplesI <= 120 |
| |
| else if (theta > (1.0 - 1e-5)) { // empirically-determined threshold |
| return (numSamplesI); |
| } |
| |
| else if (theta < ((numSamplesI) / 360.0)) { // empirically-determined threshold |
| // here we use the gaussian approximation, but with a modified "numSDev" |
| final int index; |
| final double rawLB; |
| index = (3 * ((int) numSamplesI)) + (numSDev - 1); |
| rawLB = contClassicLB(numSamplesI, theta, EquivTables.getLB(index)); |
| return (rawLB - 0.5); // fake round down |
| } |
| |
| else { // This is the most difficult range to approximate; we will compute an "exact" LB. |
| // We know that est <= 360, so specialNStar() shouldn't be ridiculously slow. |
| final double delta = deltaOfNumSDev[numSDev]; |
| final long nstar = specialNStar(numSamplesI, theta, delta); |
| return (nstar); // don't need to round |
| } |
| } |
| |
| // The following computes an approximation to the upper bound of |
| // a Frequentist confidence interval based on the tails of the Binomial distribution. |
| private static double computeApproxBinoUB(final long numSamplesI, final double theta, |
| final int numSDev) { |
| if (theta == 1.0) { |
| return (numSamplesI); |
| } |
| |
| else if (numSamplesI == 0) { |
| final double delta = deltaOfNumSDev[numSDev]; |
| final double rawUB = (Math.log(delta)) / (Math.log(1.0 - theta)); |
| return (Math.ceil(rawUB)); // round up |
| } |
| |
| else if (numSamplesI > 120) { |
| // plenty of samples, so gaussian approximation to binomial distribution isn't too bad |
| final double rawUB = contClassicUB(numSamplesI, theta, numSDev); |
| return (rawUB + 0.5); // fake round up |
| } |
| |
| // at this point we know 1 <= numSamplesI <= 120 |
| |
| else if (theta > (1.0 - 1e-5)) { // empirically-determined threshold |
| return (numSamplesI + 1); |
| } |
| |
| else if (theta < ((numSamplesI) / 360.0)) { // empirically-determined threshold |
| // here we use the gaussian approximation, but with a modified "numSDev" |
| final int index; |
| final double rawUB; |
| index = (3 * ((int) numSamplesI)) + (numSDev - 1); |
| rawUB = contClassicUB(numSamplesI, theta, EquivTables.getUB(index)); |
| return (rawUB + 0.5); // fake round up |
| } |
| |
| else { // This is the most difficult range to approximate; we will compute an "exact" UB. |
| // We know that est <= 360, so specialNPrimeF() shouldn't be ridiculously slow. |
| final double delta = deltaOfNumSDev[numSDev]; |
| final long nprimef = specialNPrimeF(numSamplesI, theta, delta); |
| return (nprimef); // don't need to round |
| } |
| } |
| |
| // The following two procedures enforce some extra rules that help |
| // to prevent the return of bounds that might be confusing to users. |
| /** |
| * Returns the approximate lower bound value |
| * @param numSamples the number of samples in the sample set |
| * @param theta the sampling probability |
| * @param numSDev the number of "standard deviations" from the mean for the tail bounds. |
| * This must be an integer value of 1, 2 or 3. |
| * @param noDataSeen this is normally false. However, in the case where you have zero samples |
| * and a theta < 1.0, this flag enables the distinction between a virgin case when no actual |
| * data has been seen and the case where the estimate may be zero but an upper error bound may |
| * still exist. |
| * @return the approximate upper bound value |
| */ |
| public static double getLowerBound(final long numSamples, final double theta, final int numSDev, |
| final boolean noDataSeen) { |
| //in earlier code numSamples was called numSamplesI |
| if (noDataSeen) { return 0.0; } |
| checkArgs(numSamples, theta, numSDev); |
| final double lb = computeApproxBinoLB(numSamples, theta, numSDev); |
| final double numSamplesF = numSamples; |
| final double est = numSamplesF / theta; |
| return (Math.min(est, Math.max(numSamplesF, lb))); |
| } |
| |
| /** |
| * Returns the approximate upper bound value |
| * @param numSamples the number of samples in the sample set |
| * @param theta the sampling probability |
| * @param numSDev the number of "standard deviations" from the mean for the tail bounds. |
| * This must be an integer value of 1, 2 or 3. |
| * @param noDataSeen this is normally false. However, in the case where you have zero samples |
| * and a theta < 1.0, this flag enables the distinction between a virgin case when no actual |
| * data has been seen and the case where the estimate may be zero but an upper error bound may |
| * still exist. |
| * @return the approximate upper bound value |
| */ |
| public static double getUpperBound(final long numSamples, final double theta, final int numSDev, |
| final boolean noDataSeen) { |
| //in earlier code numSamples was called numSamplesI |
| if (noDataSeen) { return 0.0; } |
| checkArgs(numSamples, theta, numSDev); |
| final double ub = computeApproxBinoUB(numSamples, theta, numSDev); |
| final double numSamplesF = numSamples; |
| final double est = numSamplesF / theta; |
| return (Math.max(est, ub)); |
| } |
| |
| //exposed only for test |
| static final void checkArgs(final long numSamples, final double theta, final int numSDev) { |
| if ((numSDev | (numSDev - 1) | (3 - numSDev) | numSamples) < 0) { |
| throw new SketchesArgumentException( |
| "numSDev must only be 1,2, or 3 and numSamples must >= 0: numSDev=" |
| + numSDev + ", numSamples=" + numSamples); |
| } |
| if ((theta < 0.0) || (theta > 1.0)) { |
| throw new SketchesArgumentException("0.0 < theta <= 1.0: " + theta); |
| } |
| } |
| |
| private static void assertTrue(final boolean truth) { |
| assert (truth); |
| } |
| |
| } // end of class "BinomialBoundsN" |