MATH-1616: Refactor "EmpiricalDistribution".
* No default bin count (cf. MATH-1462).
* No data loading from external sources (file, URL).
* No data abstraction layer.
* Return defensive copies of the internal state.
* Make class immutable.
* Allow user-defined within-bin kernel.
diff --git a/commons-math-legacy/src/main/java/org/apache/commons/math4/legacy/distribution/EmpiricalDistribution.java b/commons-math-legacy/src/main/java/org/apache/commons/math4/legacy/distribution/EmpiricalDistribution.java
index 65e5808..3fd8bbf 100644
--- a/commons-math-legacy/src/main/java/org/apache/commons/math4/legacy/distribution/EmpiricalDistribution.java
+++ b/commons-math-legacy/src/main/java/org/apache/commons/math4/legacy/distribution/EmpiricalDistribution.java
@@ -17,29 +17,16 @@
package org.apache.commons.math4.legacy.distribution;
-import java.io.BufferedReader;
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.InputStreamReader;
-import java.net.URL;
-import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.List;
+import java.util.function.Function;
import org.apache.commons.statistics.distribution.NormalDistribution;
import org.apache.commons.statistics.distribution.ContinuousDistribution;
import org.apache.commons.statistics.distribution.ConstantContinuousDistribution;
import org.apache.commons.numbers.core.Precision;
-import org.apache.commons.rng.UniformRandomProvider;
-import org.apache.commons.math4.legacy.exception.MathIllegalStateException;
-import org.apache.commons.math4.legacy.exception.MathInternalError;
-import org.apache.commons.math4.legacy.exception.NullArgumentException;
import org.apache.commons.math4.legacy.exception.OutOfRangeException;
-import org.apache.commons.math4.legacy.exception.ZeroException;
import org.apache.commons.math4.legacy.exception.NotStrictlyPositiveException;
-import org.apache.commons.math4.legacy.exception.util.LocalizedFormats;
import org.apache.commons.math4.legacy.stat.descriptive.StatisticalSummary;
import org.apache.commons.math4.legacy.stat.descriptive.SummaryStatistics;
import org.apache.commons.math4.legacy.core.jdkmath.AccurateMath;
@@ -99,290 +86,138 @@
*
* <strong>USAGE NOTES:</strong>
* <ul>
- * <li>The {@code binCount} is set by default to 1000. A good rule of thumb
- * is to set the bin count to approximately the length of the input file divided
- * by 10. </li>
- * <li>The input file <i>must</i> be a plain text file containing one valid numeric
- * entry per line.</li>
+ * <li>
+ * The {@code binCount} is set by default to 1000. A good rule of thumb
+ * is to set the bin count to approximately the length of the input file
+ * divided by 10. </li>
+ * <li>
+ * The input file <i>must</i> be a plain text file containing one valid
+ * numeric entry per line.</li>
* </ul>
*/
-public class EmpiricalDistribution extends AbstractRealDistribution
+public final class EmpiricalDistribution extends AbstractRealDistribution
implements ContinuousDistribution {
- /** Default bin count. */
- public static final int DEFAULT_BIN_COUNT = 1000;
- /** Character set for file input. */
- private static final String FILE_CHARSET = "US-ASCII";
- /** Bins' characteristics. */
+ /** Bins characteristics. */
private final List<SummaryStatistics> binStats;
/** Sample statistics. */
- private SummaryStatistics sampleStats;
+ private final SummaryStatistics sampleStats;
/** Max loaded value. */
- private double max = Double.NEGATIVE_INFINITY;
+ private final double max;
/** Min loaded value. */
- private double min = Double.POSITIVE_INFINITY;
+ private final double min;
/** Grid size. */
- private double delta;
+ private final double delta;
/** Number of bins. */
private final int binCount;
- /** Whether the distribution is loaded. */
- private boolean loaded;
- /** Upper bounds of subintervals in (0,1) belonging to the bins. */
- private double[] upperBounds;
+ /** Upper bounds of subintervals in (0, 1) belonging to the bins. */
+ private final double[] upperBounds;
+ /** Kernel factory. */
+ private final Function<SummaryStatistics, ContinuousDistribution> kernelFactory;
/**
- * Creates a new EmpiricalDistribution with the default bin count.
- */
- public EmpiricalDistribution() {
- this(DEFAULT_BIN_COUNT);
- }
-
- /**
- * Creates a new EmpiricalDistribution with the specified bin count.
+ * Creates a new instance with the specified data.
*
- * @param binCount number of bins. Must be strictly positive.
+ * @param binCount Number of bins. Must be strictly positive.
+ * @param input Input data. Cannot be {@code null}.
+ * @param kernelFactory Kernel factory.
* @throws NotStrictlyPositiveException if {@code binCount <= 0}.
*/
- public EmpiricalDistribution(int binCount) {
+ private EmpiricalDistribution(int binCount,
+ double[] input,
+ Function<SummaryStatistics, ContinuousDistribution> kernelFactory) {
if (binCount <= 0) {
throw new NotStrictlyPositiveException(binCount);
}
this.binCount = binCount;
- binStats = new ArrayList<>();
- }
- /**
- * Computes the empirical distribution from the provided
- * array of numbers.
- *
- * @param in the input data array
- * @exception NullArgumentException if in is null
- */
- public void load(double[] in) {
- DataAdapter da = new ArrayDataAdapter(in);
- try {
- da.computeStats();
- // new adapter for the second pass
- fillBinStats(new ArrayDataAdapter(in));
- } catch (IOException ex) {
- // Can't happen
- throw new MathInternalError();
- }
- loaded = true;
-
- }
-
- /**
- * Computes the empirical distribution using data read from a URL.
- *
- * <p>The input file <i>must</i> be an ASCII text file containing one
- * valid numeric entry per line.</p>
- *
- * @param url url of the input file
- *
- * @throws IOException if an IO error occurs
- * @throws NullArgumentException if url is null
- * @throws ZeroException if URL contains no data
- */
- public void load(URL url) throws IOException {
- NullArgumentException.check(url);
- Charset charset = Charset.forName(FILE_CHARSET);
- BufferedReader in =
- new BufferedReader(new InputStreamReader(url.openStream(), charset));
- try {
- DataAdapter da = new StreamDataAdapter(in);
- da.computeStats();
- if (sampleStats.getN() == 0) {
- throw new ZeroException(LocalizedFormats.URL_CONTAINS_NO_DATA, url);
- }
- // new adapter for the second pass
- in = new BufferedReader(new InputStreamReader(url.openStream(), charset));
- fillBinStats(new StreamDataAdapter(in));
- loaded = true;
- } finally {
- try {
- in.close();
- } catch (IOException ex) { //NOPMD
- // ignore
- }
- }
- }
-
- /**
- * Computes the empirical distribution from the input file.
- *
- * <p>The input file <i>must</i> be an ASCII text file containing one
- * valid numeric entry per line.</p>
- *
- * @param file the input file
- * @throws IOException if an IO error occurs
- * @throws NullArgumentException if file is null
- */
- public void load(File file) throws IOException {
- NullArgumentException.check(file);
- Charset charset = Charset.forName(FILE_CHARSET);
- InputStream is = new FileInputStream(file);
- BufferedReader in = new BufferedReader(new InputStreamReader(is, charset));
- try {
- DataAdapter da = new StreamDataAdapter(in);
- da.computeStats();
- // new adapter for second pass
- is = new FileInputStream(file);
- in = new BufferedReader(new InputStreamReader(is, charset));
- fillBinStats(new StreamDataAdapter(in));
- loaded = true;
- } finally {
- try {
- in.close();
- } catch (IOException ex) { //NOPMD
- // ignore
- }
- }
- }
-
- /**
- * Provides methods for computing {@code sampleStats} and
- * {@code beanStats} abstracting the source of data.
- */
- private abstract class DataAdapter {
- /**
- * Compute bin stats.
- *
- * @throws IOException if an error occurs computing bin stats
- */
- public abstract void computeBinStats() throws IOException;
-
- /**
- * Compute sample statistics.
- *
- * @throws IOException if an error occurs computing sample stats
- */
- public abstract void computeStats() throws IOException;
- }
-
- /**
- * {@code DataAdapter} for data provided through some input stream.
- */
- private class StreamDataAdapter extends DataAdapter {
- /** Input stream providing access to the data. */
- private final BufferedReader inputStream;
-
- /**
- * Create a StreamDataAdapter from a BufferedReader.
- *
- * @param in BufferedReader input stream
- */
- StreamDataAdapter(BufferedReader in){
- inputStream = in;
+ // First pass through the data.
+ sampleStats = new SummaryStatistics();
+ for (int i = 0; i < input.length; i++) {
+ sampleStats.addValue(input[i]);
}
- /** {@inheritDoc} */
- @Override
- public void computeBinStats() throws IOException {
- String str = null;
- double val = 0.0d;
- while ((str = inputStream.readLine()) != null) {
- val = Double.parseDouble(str);
- SummaryStatistics stats = binStats.get(findBin(val));
- stats.addValue(val);
- }
-
- inputStream.close();
- }
-
- /** {@inheritDoc} */
- @Override
- public void computeStats() throws IOException {
- String str = null;
- double val = 0.0;
- sampleStats = new SummaryStatistics();
- while ((str = inputStream.readLine()) != null) {
- val = Double.parseDouble(str);
- sampleStats.addValue(val);
- }
- inputStream.close();
- }
- }
-
- /**
- * {@code DataAdapter} for data provided as array of doubles.
- */
- private class ArrayDataAdapter extends DataAdapter {
- /** Array of input data values. */
- private final double[] inputArray;
-
- /**
- * Construct an ArrayDataAdapter from a double[] array.
- *
- * @param in double[] array holding the data
- * @throws NullArgumentException if in is null
- */
- ArrayDataAdapter(double[] in) {
- NullArgumentException.check(in);
- inputArray = in;
- }
-
- /** {@inheritDoc} */
- @Override
- public void computeStats() throws IOException {
- sampleStats = new SummaryStatistics();
- for (int i = 0; i < inputArray.length; i++) {
- sampleStats.addValue(inputArray[i]);
- }
- }
-
- /** {@inheritDoc} */
- @Override
- public void computeBinStats() throws IOException {
- for (int i = 0; i < inputArray.length; i++) {
- SummaryStatistics stats = binStats.get(findBin(inputArray[i]));
- stats.addValue(inputArray[i]);
- }
- }
- }
-
- /**
- * Fills binStats array (second pass through data file).
- *
- * @param da object providing access to the data
- * @throws IOException if an IO error occurs
- */
- private void fillBinStats(final DataAdapter da) throws IOException {
- // Set up grid
+ // Set up grid.
min = sampleStats.getMin();
max = sampleStats.getMax();
delta = (max - min) / binCount;
- // Initialize binStats ArrayList
- if (!binStats.isEmpty()) {
- binStats.clear();
- }
- for (int i = 0; i < binCount; i++) {
- SummaryStatistics stats = new SummaryStatistics();
- binStats.add(i, stats);
- }
+ // Second pass through the data.
+ binStats = createBinStats(input);
- // Filling data in binStats Array
- da.computeBinStats();
-
- // Assign upperBounds based on bin counts
+ // Assign upper bounds based on bin counts.
upperBounds = new double[binCount];
- upperBounds[0] = binStats.get(0).getN() / (double) sampleStats.getN();
+ final double n = (double) sampleStats.getN();
+ upperBounds[0] = binStats.get(0).getN() / n;
for (int i = 1; i < binCount - 1; i++) {
- upperBounds[i] = upperBounds[i - 1] +
- binStats.get(i).getN() / (double) sampleStats.getN();
+ upperBounds[i] = upperBounds[i - 1] + binStats.get(i).getN() / n;
}
upperBounds[binCount - 1] = 1d;
+
+ this.kernelFactory = kernelFactory;
+ }
+
+ /**
+ * Factory that creates a new instance from the specified data.
+ *
+ * @param binCount Number of bins. Must be strictly positive.
+ * @param input Input data. Cannot be {@code null}.
+ * @param kernelFactory Factory for creating within-bin kernels.
+ * @return a new instance.
+ * @throws NotStrictlyPositiveException if {@code binCount <= 0}.
+ */
+ public static EmpiricalDistribution from(int binCount,
+ double[] input,
+ Function<SummaryStatistics, ContinuousDistribution> kernelFactory) {
+ return new EmpiricalDistribution(binCount,
+ input,
+ kernelFactory);
+ }
+
+ /**
+ * Factory that creates a new instance from the specified data.
+ *
+ * @param binCount Number of bins. Must be strictly positive.
+ * @param input Input data. Cannot be {@code null}.
+ * @return a new instance.
+ * @throws NotStrictlyPositiveException if {@code binCount <= 0}.
+ */
+ public static EmpiricalDistribution from(int binCount,
+ double[] input) {
+ return from(binCount, input, defaultKernel());
+ }
+
+ /**
+ * Create statistics (second pass through the data).
+ *
+ * @param input Input data.
+ * @return bins statistics.
+ */
+ private List<SummaryStatistics> createBinStats(double[] input) {
+ final List<SummaryStatistics> binStats = new ArrayList<>();
+
+ for (int i = 0; i < binCount; i++) {
+ binStats.add(i, new SummaryStatistics());
+ }
+
+ // Second pass though the data.
+ for (int i = 0; i < input.length; i++) {
+ final double v = input[i];
+ binStats.get(findBin(v)).addValue(v);
+ }
+
+ return binStats;
}
/**
* Returns the index of the bin to which the given value belongs.
*
- * @param value the value whose bin we are trying to find
- * @return the index of the bin containing the value
+ * @param value Value whose bin we are trying to find.
+ * @return the index of the bin containing the value.
*/
private int findBin(double value) {
- return AccurateMath.min(AccurateMath.max((int) AccurateMath.ceil((value - min) / delta) - 1, 0),
- binCount - 1);
+ return Math.min(Math.max((int) AccurateMath.ceil((value - min) / delta) - 1,
+ 0),
+ binCount - 1);
}
/**
@@ -394,7 +229,7 @@
* @throws IllegalStateException if the distribution has not been loaded
*/
public StatisticalSummary getSampleStats() {
- return sampleStats;
+ return sampleStats.copy();
}
/**
@@ -407,27 +242,33 @@
}
/**
- * Returns a List of {@link SummaryStatistics} instances containing
- * statistics describing the values in each of the bins. The list is
- * indexed on the bin number.
+ * Returns a copy of the {@link SummaryStatistics} instances containing
+ * statistics describing the values in each of the bins.
+ * The list is indexed on the bin number.
*
- * @return List of bin statistics.
+ * @return the bins statistics.
*/
public List<SummaryStatistics> getBinStats() {
- return binStats;
+ final List<SummaryStatistics> copy = new ArrayList<>();
+ for (SummaryStatistics s : binStats) {
+ copy.add(s.copy());
+ }
+ return copy;
}
/**
- * <p>Returns a fresh copy of the array of upper bounds for the bins.
- * Bins are: <br>
- * [min,upperBounds[0]],(upperBounds[0],upperBounds[1]],...,
- * (upperBounds[binCount-2], upperBounds[binCount-1] = max].</p>
+ * Returns the upper bounds of the bins.
*
- * <p>Note: In versions 1.0-2.0 of commons-math, this method
- * incorrectly returned the array of probability generator upper
- * bounds now returned by {@link #getGeneratorUpperBounds()}.</p>
+ * Assuming array {@code u} is returned by this method, the bins are:
+ * <ul>
+ * <li>{@code (min, u[0])},</li>
+ * <li>{@code (u[0], u[1])},</li>
+ * <li>... ,</li>
+ * <li>{@code (u[binCount - 2], u[binCount - 1] = max)},</li>
+ * </ul>
*
- * @return array of bin upper bounds
+ * @return the bins upper bounds.
+ *
* @since 2.1
*/
public double[] getUpperBounds() {
@@ -440,20 +281,18 @@
}
/**
- * <p>Returns a fresh copy of the array of upper bounds of the subintervals
- * of [0,1] used in generating data from the empirical distribution.
- * Subintervals correspond to bins with lengths proportional to bin counts.</p>
+ * Returns the upper bounds of the subintervals of [0, 1] used in generating
+ * data from the empirical distribution.
+ * Subintervals correspond to bins with lengths proportional to bin counts.
*
* <strong>Preconditions:</strong><ul>
* <li>the distribution must be loaded before invoking this method</li></ul>
*
- * <p>In versions 1.0-2.0 of commons-math, this array was (incorrectly) returned
- * by {@link #getUpperBounds()}.</p>
- *
- * @since 2.1
* @return array of upper bounds of subintervals used in data generation
* @throws NullPointerException unless a {@code load} method has been
* called beforehand.
+ *
+ * @since 2.1
*/
public double[] getGeneratorUpperBounds() {
int len = upperBounds.length;
@@ -462,15 +301,6 @@
return out;
}
- /**
- * Property indicating whether or not the distribution has been loaded.
- *
- * @return true if the distribution has been loaded
- */
- public boolean isLoaded() {
- return loaded;
- }
-
// Distribution methods.
/**
@@ -485,15 +315,18 @@
/**
* {@inheritDoc}
*
- * <p>Returns the kernel density normalized so that its integral over each bin
- * equals the bin mass.</p>
+ * Returns the kernel density normalized so that its integral over each bin
+ * equals the bin mass.
*
- * <p>Algorithm description: <ol>
- * <li>Find the bin B that x belongs to.</li>
- * <li>Compute K(B) = the mass of B with respect to the within-bin kernel (i.e., the
- * integral of the kernel density over B).</li>
- * <li>Return k(x) * P(B) / K(B), where k is the within-bin kernel density
- * and P(B) is the mass of B.</li></ol>
+ * Algorithm description:
+ * <ol>
+ * <li>Find the bin B that x belongs to.</li>
+ * <li>Compute K(B) = the mass of B with respect to the within-bin kernel (i.e., the
+ * integral of the kernel density over B).</li>
+ * <li>Return k(x) * P(B) / K(B), where k is the within-bin kernel density
+ * and P(B) is the mass of B.</li>
+ * </ol>
+ *
* @since 3.1
*/
@Override
@@ -509,13 +342,15 @@
/**
* {@inheritDoc}
*
- * <p>Algorithm description:<ol>
- * <li>Find the bin B that x belongs to.</li>
- * <li>Compute P(B) = the mass of B and P(B-) = the combined mass of the bins below B.</li>
- * <li>Compute K(B) = the probability mass of B with respect to the within-bin kernel
- * and K(B-) = the kernel distribution evaluated at the lower endpoint of B</li>
- * <li>Return P(B-) + P(B) * [K(x) - K(B-)] / K(B) where
- * K(x) is the within-bin kernel distribution function evaluated at x.</li></ol>
+ * Algorithm description:
+ * <ol>
+ * <li>Find the bin B that x belongs to.</li>
+ * <li>Compute P(B) = the mass of B and P(B-) = the combined mass of the bins below B.</li>
+ * <li>Compute K(B) = the probability mass of B with respect to the within-bin kernel
+ * and K(B-) = the kernel distribution evaluated at the lower endpoint of B</li>
+ * <li>Return P(B-) + P(B) * [K(x) - K(B-)] / K(B) where
+ * K(x) is the within-bin kernel distribution function evaluated at x.</li>
+ * </ol>
* If K is a constant distribution, we return P(B-) + P(B) (counting the full
* mass of B).
*
@@ -550,20 +385,24 @@
/**
* {@inheritDoc}
*
- * <p>Algorithm description:<ol>
- * <li>Find the smallest i such that the sum of the masses of the bins
- * through i is at least p.</li>
- * <li>
- * Let K be the within-bin kernel distribution for bin i.<br>
- * Let K(B) be the mass of B under K. <br>
- * Let K(B-) be K evaluated at the lower endpoint of B (the combined
- * mass of the bins below B under K).<br>
- * Let P(B) be the probability of bin i.<br>
- * Let P(B-) be the sum of the bin masses below bin i. <br>
- * Let pCrit = p - P(B-)<br>
- * <li>Return the inverse of K evaluated at <br>
+ * Algorithm description:
+ * <ol>
+ * <li>Find the smallest i such that the sum of the masses of the bins
+ * through i is at least p.</li>
+ * <li>
+ * <ol>
+ * <li>Let K be the within-bin kernel distribution for bin i.</li>
+ * <li>Let K(B) be the mass of B under K.</li>
+ * <li>Let K(B-) be K evaluated at the lower endpoint of B (the combined
+ * mass of the bins below B under K).</li>
+ * <li>Let P(B) be the probability of bin i.</li>
+ * <li>Let P(B-) be the sum of the bin masses below bin i.</li>
+ * <li>Let pCrit = p - P(B-)</li>
+ * </ol>
+ * </li>
+ * <li>Return the inverse of K evaluated at
* K(B-) + pCrit * K(B) / P(B) </li>
- * </ol>
+ * </ol>
*
* @since 3.1
*/
@@ -651,15 +490,6 @@
return true;
}
- /**{@inheritDoc} */
- @Override
- public ContinuousDistribution.Sampler createSampler(final UniformRandomProvider rng) {
- if (!loaded) {
- throw new MathIllegalStateException(LocalizedFormats.DISTRIBUTION_NOT_LOADED);
- }
- return super.createSampler(rng);
- }
-
/**
* The probability of bin i.
*
@@ -717,19 +547,29 @@
}
/**
- * The within-bin smoothing kernel. Returns a Gaussian distribution
- * parameterized by {@code bStats}, unless the bin contains only one
- * observation, in which case a constant distribution is returned.
- *
- * @param bStats summary statistics for the bin
- * @return within-bin kernel parameterized by bStats
+ * @param stats Bin statistics.
+ * @return the within-bin kernel.
*/
- protected ContinuousDistribution getKernel(SummaryStatistics bStats) {
- if (bStats.getN() <= 1 ||
- bStats.getVariance() == 0) {
- return new ConstantContinuousDistribution(bStats.getMean());
- } else {
- return new NormalDistribution(bStats.getMean(), bStats.getStandardDeviation());
- }
+ private ContinuousDistribution getKernel(SummaryStatistics stats) {
+ return kernelFactory.apply(stats);
+ }
+
+ /**
+ * The within-bin smoothing kernel: A Gaussian distribution
+ * (unless the bin contains only one observation, in which case
+ * a constant distribution is returned).
+ *
+ * @return the within-bin kernel factory.
+ */
+ private static Function<SummaryStatistics, ContinuousDistribution> defaultKernel() {
+ return stats -> {
+ if (stats.getN() <= 1 ||
+ stats.getVariance() == 0) {
+ return new ConstantContinuousDistribution(stats.getMean());
+ } else {
+ return new NormalDistribution(stats.getMean(),
+ stats.getStandardDeviation());
+ }
+ };
}
}
diff --git a/commons-math-legacy/src/test/java/org/apache/commons/math4/legacy/distribution/EmpiricalDistributionTest.java b/commons-math-legacy/src/test/java/org/apache/commons/math4/legacy/distribution/EmpiricalDistributionTest.java
index b86b7c3..3217317 100644
--- a/commons-math-legacy/src/test/java/org/apache/commons/math4/legacy/distribution/EmpiricalDistributionTest.java
+++ b/commons-math-legacy/src/test/java/org/apache/commons/math4/legacy/distribution/EmpiricalDistributionTest.java
@@ -17,7 +17,6 @@
package org.apache.commons.math4.legacy.distribution;
import java.io.BufferedReader;
-import java.io.File;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
@@ -35,8 +34,6 @@
import org.apache.commons.math4.legacy.analysis.UnivariateFunction;
import org.apache.commons.math4.legacy.analysis.integration.BaseAbstractUnivariateIntegrator;
import org.apache.commons.math4.legacy.analysis.integration.IterativeLegendreGaussIntegrator;
-import org.apache.commons.math4.legacy.exception.MathIllegalStateException;
-import org.apache.commons.math4.legacy.exception.NullArgumentException;
import org.apache.commons.math4.legacy.exception.NotStrictlyPositiveException;
import org.apache.commons.math4.legacy.stat.descriptive.SummaryStatistics;
import org.apache.commons.math4.legacy.core.jdkmath.AccurateMath;
@@ -48,32 +45,28 @@
* Test cases for the {@link EmpiricalDistribution} class.
*/
public final class EmpiricalDistributionTest extends RealDistributionAbstractTest {
-
- protected EmpiricalDistribution empiricalDistribution = null;
- protected EmpiricalDistribution empiricalDistribution2 = null;
- protected File file = null;
- protected URL url = null;
- protected double[] dataArray = null;
- protected final int n = 10000;
+ private EmpiricalDistribution empiricalDistribution = null;
+ private double[] dataArray = null;
+ private final int n = 10000;
+ /** Uniform bin mass = 10/10001 == mass of all but the first bin */
+ private final double binMass = 10d / (n + 1);
+ /** Mass of first bin = 11/10001 */
+ private final double firstBinMass = 11d / (n + 1);
@Override
@Before
public void setUp() {
super.setUp();
- empiricalDistribution = new EmpiricalDistribution(100);
- url = getClass().getResource("testData.txt");
+
+ final URL url = getClass().getResource("testData.txt");
final ArrayList<Double> list = new ArrayList<>();
try {
- empiricalDistribution2 = new EmpiricalDistribution(100);
- BufferedReader in =
- new BufferedReader(new InputStreamReader(
- url.openStream()));
+ final BufferedReader in = new BufferedReader(new InputStreamReader(url.openStream()));
String str = null;
while ((str = in.readLine()) != null) {
list.add(Double.valueOf(str));
}
in.close();
- in = null;
} catch (IOException ex) {
Assert.fail("IOException " + ex);
}
@@ -84,193 +77,68 @@
dataArray[i] = data.doubleValue();
i++;
}
+
+ empiricalDistribution = EmpiricalDistribution.from(100, dataArray);
}
// MATH-1279
@Test(expected=NotStrictlyPositiveException.class)
public void testPrecondition1() {
- new EmpiricalDistribution(0);
+ EmpiricalDistribution.from(0, new double[] {1,2,3});
}
/**
- * Test EmpiricalDistribution.load() using sample data file.<br>
- * Check that the sampleCount, mu and sigma match data in
- * the sample data file. Also verify that load is idempotent.
+ * Test using data taken from sample data file.
+ * Check that the sampleCount, mu and sigma match data in the sample data file.
*/
@Test
- public void testLoad() throws Exception {
- // Load from a URL
- empiricalDistribution.load(url);
- checkDistribution();
-
- // Load again from a file (also verifies idempotency of load)
- File file = new File(url.toURI());
- empiricalDistribution.load(file);
- checkDistribution();
- }
-
- private void checkDistribution() {
+ public void testDoubleLoad() {
// testData File has 10000 values, with mean ~ 5.0, std dev ~ 1
// Make sure that loaded distribution matches this
- Assert.assertEquals(empiricalDistribution.getSampleStats().getN(),1000,10E-7);
+ Assert.assertEquals(empiricalDistribution.getSampleStats().getN(),
+ 1000, 1e-7);
//TODO: replace with statistical tests
Assert.assertEquals(empiricalDistribution.getSampleStats().getMean(),
- 5.069831575018909,10E-7);
+ 5.069831575018909, 1e-7);
Assert.assertEquals(empiricalDistribution.getSampleStats().getStandardDeviation(),
- 1.0173699343977738,10E-7);
- }
+ 1.0173699343977738, 1e-7);
- /**
- * Test EmpiricalDistribution.load(double[]) using data taken from
- * sample data file.<br>
- * Check that the sampleCount, mu and sigma match data in
- * the sample data file.
- */
- @Test
- public void testDoubleLoad() throws Exception {
- empiricalDistribution2.load(dataArray);
- // testData File has 10000 values, with mean ~ 5.0, std dev ~ 1
- // Make sure that loaded distribution matches this
- Assert.assertEquals(empiricalDistribution2.getSampleStats().getN(),1000,10E-7);
- //TODO: replace with statistical tests
- Assert.assertEquals(empiricalDistribution2.getSampleStats().getMean(),
- 5.069831575018909,10E-7);
- Assert.assertEquals(empiricalDistribution2.getSampleStats().getStandardDeviation(),
- 1.0173699343977738,10E-7);
-
- double[] bounds = empiricalDistribution2.getGeneratorUpperBounds();
+ double[] bounds = empiricalDistribution.getGeneratorUpperBounds();
Assert.assertEquals(bounds.length, 100);
Assert.assertEquals(bounds[99], 1.0, 10e-12);
-
}
-
// MATH-1531
@Test
public void testMath1531() {
- final EmpiricalDistribution inputDistribution = new EmpiricalDistribution(120);
- inputDistribution.load(new double[] {
- 50.993456376721454,
- 49.455345691918055,
- 49.527276095295804,
- 50.017183448668845,
- 49.10508147470046,
- 49.813998274118696,
- 50.87195348756139,
- 50.419474110037,
- 50.63614906979689,
- 49.49694777179407,
- 50.71799078406067,
- 50.03192853759164,
- 49.915092423165994,
- 49.56895392597687,
- 51.034638001064934,
- 50.681227971275945,
- 50.43749845081759,
- 49.86513120270245,
- 50.21475262482965,
- 49.99202971042547,
- 50.02382189838519,
- 49.386888585302884,
- 49.45585010202781,
- 49.988009479855435,
- 49.8136712206123,
- 49.6715197127997,
- 50.1981278397565,
- 49.842297508010276,
- 49.62491227740015,
- 50.05101916097176,
- 48.834912763303926,
- 49.806787657848574,
- 49.478236106374695,
- 49.56648347371614,
- 49.95069238081982,
- 49.71845132077346,
- 50.6097468705947,
- 49.80724637775541,
- 49.90448813086025,
- 49.39641861662603,
- 50.434295712893714,
- 49.227176959566734,
- 49.541126466050905,
- 49.03416593170446,
- 49.11584328494423,
- 49.61387482435674,
- 49.92877857995328,
- 50.70638552955101,
- 50.60078208448842,
- 49.39326233277838,
- 49.21488424364095,
- 49.69503351015096,
- 50.13733214001718,
- 50.22084761458942,
- 51.09804435604931,
- 49.18559131120419,
- 49.52286371605357,
- 49.34804374996689,
- 49.6901827776375,
- 50.01316351359638,
- 48.7751460520373,
- 50.12961836291053,
- 49.9978419772511,
- 49.885658399408584,
- 49.673438879979834,
- 49.45565980965606,
- 50.429747484906564,
- 49.40129274804164,
- 50.13034614008073,
- 49.87685735146651,
- 50.12967905393557,
- 50.323560376181696,
- 49.83519233651367,
- 49.37333369733053,
- 49.70074301611427,
- 50.11626105774947,
- 50.28249500380083,
- 50.543354367136466,
- 50.05866241335002,
- 50.39516515672527,
- 49.4838561463057,
- 50.451757089234796,
- 50.31370674203726,
- 49.79063762614284,
- 50.19652349768548,
- 49.75881420748814,
- 49.98371855036422,
- 49.82171344472916,
- 48.810793204162415,
- 49.37040569084592,
- 50.050641186203976,
- 50.48360952263646,
- 50.86666450358076,
- 50.463268776129844,
- 50.137489751888666,
- 50.23823061444118,
- 49.881460479468004,
- 50.641174398764356,
- 49.09314136851421,
- 48.80877928574451,
- 50.46197084844826,
- 49.97691704141741,
- 49.99933997561926,
- 50.25692254481885,
- 49.52973451252715,
- 49.81229858420664,
- 48.996112655915994,
- 48.740531054814674,
- 50.026642633066416,
- 49.98696633604899,
- 49.61307159972952,
- 50.5115278979726,
- 50.75245152442404,
- 50.51807785445929,
- 49.60929671768147,
- 49.1079533564074,
- 49.65347196551866,
- 49.31684818724059,
- 50.4906368627049,
- 50.37483603684714});
- inputDistribution.inverseCumulativeProbability(0.7166666666666669);
+ final double[] data = new double[] {
+ 50.993456376721454, 49.455345691918055, 49.527276095295804, 50.017183448668845, 49.10508147470046,
+ 49.813998274118696, 50.87195348756139, 50.419474110037, 50.63614906979689, 49.49694777179407,
+ 50.71799078406067, 50.03192853759164, 49.915092423165994, 49.56895392597687, 51.034638001064934,
+ 50.681227971275945, 50.43749845081759, 49.86513120270245, 50.21475262482965, 49.99202971042547,
+ 50.02382189838519, 49.386888585302884, 49.45585010202781, 49.988009479855435, 49.8136712206123,
+ 49.6715197127997, 50.1981278397565, 49.842297508010276, 49.62491227740015, 50.05101916097176,
+ 48.834912763303926, 49.806787657848574, 49.478236106374695, 49.56648347371614, 49.95069238081982,
+ 49.71845132077346, 50.6097468705947, 49.80724637775541, 49.90448813086025, 49.39641861662603,
+ 50.434295712893714, 49.227176959566734, 49.541126466050905, 49.03416593170446, 49.11584328494423,
+ 49.61387482435674, 49.92877857995328, 50.70638552955101, 50.60078208448842, 49.39326233277838,
+ 49.21488424364095, 49.69503351015096, 50.13733214001718, 50.22084761458942, 51.09804435604931,
+ 49.18559131120419, 49.52286371605357, 49.34804374996689, 49.6901827776375, 50.01316351359638,
+ 48.7751460520373, 50.12961836291053, 49.9978419772511, 49.885658399408584, 49.673438879979834,
+ 49.45565980965606, 50.429747484906564, 49.40129274804164, 50.13034614008073, 49.87685735146651,
+ 50.12967905393557, 50.323560376181696, 49.83519233651367, 49.37333369733053, 49.70074301611427,
+ 50.11626105774947, 50.28249500380083, 50.543354367136466, 50.05866241335002, 50.39516515672527,
+ 49.4838561463057, 50.451757089234796, 50.31370674203726, 49.79063762614284, 50.19652349768548,
+ 49.75881420748814, 49.98371855036422, 49.82171344472916, 48.810793204162415, 49.37040569084592,
+ 50.050641186203976, 50.48360952263646, 50.86666450358076, 50.463268776129844, 50.137489751888666,
+ 50.23823061444118, 49.881460479468004, 50.641174398764356, 49.09314136851421, 48.80877928574451,
+ 50.46197084844826, 49.97691704141741, 49.99933997561926, 50.25692254481885, 49.52973451252715,
+ 49.81229858420664, 48.996112655915994, 48.740531054814674, 50.026642633066416, 49.98696633604899,
+ 49.61307159972952, 50.5115278979726, 50.75245152442404, 50.51807785445929, 49.60929671768147,
+ 49.1079533564074, 49.65347196551866, 49.31684818724059, 50.4906368627049, 50.37483603684714
+ };
+
+ EmpiricalDistribution.from(120, data).inverseCumulativeProbability(0.7166666666666669);
}
/**
@@ -279,84 +147,42 @@
* these tests will fail even if the code is working as designed.
*/
@Test
- public void testNext() throws Exception {
- tstGen(0.1);
- tstDoubleGen(0.1);
- }
-
- /**
- * Make sure exception thrown if sampling is attempted
- * before loading empiricalDistribution.
- */
- @Test
- public void testNextFail1() {
- try {
- empiricalDistribution.createSampler(RandomSource.create(RandomSource.JDK)).sample();
- Assert.fail("Expecting MathIllegalStateException");
- } catch (MathIllegalStateException ex) {
- // expected
- }
- }
-
- /**
- * Make sure exception thrown if sampling is attempted
- * before loading empiricalDistribution.
- */
- @Test
- public void testNextFail2() {
- try {
- empiricalDistribution2.createSampler(RandomSource.create(RandomSource.JDK)).sample();
- Assert.fail("Expecting MathIllegalStateException");
- } catch (MathIllegalStateException ex) {
- // expected
- }
+ public void testNext() {
+ tstGen(empiricalDistribution,
+ 0.1);
}
/**
* Make sure we can handle a grid size that is too fine
*/
@Test
- public void testGridTooFine() throws Exception {
- empiricalDistribution = new EmpiricalDistribution(1001);
- tstGen(0.1);
- empiricalDistribution2 = new EmpiricalDistribution(1001);
- tstDoubleGen(0.1);
+ public void testGridTooFine() {
+ tstGen(EmpiricalDistribution.from(1001, dataArray),
+ 0.1);
}
/**
* How about too fat?
*/
@Test
- public void testGridTooFat() throws Exception {
- empiricalDistribution = new EmpiricalDistribution(1);
- tstGen(5); // ridiculous tolerance; but ridiculous grid size
+ public void testGridTooFat() {
+ tstGen(EmpiricalDistribution.from(1, dataArray),
+ 5); // ridiculous tolerance; but ridiculous grid size
// really just checking to make sure we do not bomb
- empiricalDistribution2 = new EmpiricalDistribution(1);
- tstDoubleGen(5);
}
/**
* Test bin index overflow problem (BZ 36450)
*/
@Test
- public void testBinIndexOverflow() throws Exception {
+ public void testBinIndexOverflow() {
double[] x = new double[] {9474.94326071674, 2080107.8865462579};
- new EmpiricalDistribution().load(x);
+ EmpiricalDistribution.from(1000, x);
}
- @Test(expected=NullArgumentException.class)
+ @Test(expected=NullPointerException.class)
public void testLoadNullDoubleArray() {
- new EmpiricalDistribution().load((double[]) null);
- }
-
- @Test(expected=NullArgumentException.class)
- public void testLoadNullURL() throws Exception {
- new EmpiricalDistribution().load((URL) null);
- }
-
- @Test(expected=NullArgumentException.class)
- public void testLoadNullFile() throws Exception {
- new EmpiricalDistribution().load((File) null);
+ EmpiricalDistribution.from(1000, null);
}
/**
@@ -365,8 +191,7 @@
@Test
public void testGetBinUpperBounds() {
double[] testData = {0, 1, 1, 2, 3, 4, 4, 5, 6, 7, 8, 9, 10};
- EmpiricalDistribution dist = new EmpiricalDistribution(5);
- dist.load(testData);
+ EmpiricalDistribution dist = EmpiricalDistribution.from(5, testData);
double[] expectedBinUpperBounds = {2, 4, 6, 8, 10};
double[] expectedGeneratorUpperBounds = {4d/13d, 7d/13d, 9d/13d, 11d/13d, 1};
double tol = 10E-12;
@@ -374,23 +199,22 @@
TestUtils.assertEquals(expectedGeneratorUpperBounds, dist.getGeneratorUpperBounds(), tol);
}
- private void verifySame(EmpiricalDistribution d1, EmpiricalDistribution d2) {
- Assert.assertEquals(d1.isLoaded(), d2.isLoaded());
+ private void verifySame(EmpiricalDistribution d1,
+ EmpiricalDistribution d2) {
Assert.assertEquals(d1.getBinCount(), d2.getBinCount());
Assert.assertEquals(d1.getSampleStats(), d2.getSampleStats());
- if (d1.isLoaded()) {
- for (int i = 0; i < d1.getUpperBounds().length; i++) {
- Assert.assertEquals(d1.getUpperBounds()[i], d2.getUpperBounds()[i], 0);
- }
- Assert.assertEquals(d1.getBinStats(), d2.getBinStats());
+
+ for (int i = 0; i < d1.getUpperBounds().length; i++) {
+ Assert.assertEquals(d1.getUpperBounds()[i], d2.getUpperBounds()[i], 0);
}
+ Assert.assertEquals(d1.getBinStats(), d2.getBinStats());
}
- private void tstGen(double tolerance)throws Exception {
- empiricalDistribution.load(url);
- ContinuousDistribution.Sampler sampler
- = empiricalDistribution.createSampler(RandomSource.create(RandomSource.WELL_19937_C, 1000));
- SummaryStatistics stats = new SummaryStatistics();
+ private void tstGen(EmpiricalDistribution dist,
+ double tolerance) {
+ final ContinuousDistribution.Sampler sampler
+ = dist.createSampler(RandomSource.WELL_19937_C.create(1000));
+ final SummaryStatistics stats = new SummaryStatistics();
for (int i = 1; i < 1000; i++) {
stats.addValue(sampler.sample());
}
@@ -398,38 +222,19 @@
Assert.assertEquals("std dev", 1.0173699343977738, stats.getStandardDeviation(),tolerance);
}
- private void tstDoubleGen(double tolerance)throws Exception {
- empiricalDistribution2.load(dataArray);
- ContinuousDistribution.Sampler sampler
- = empiricalDistribution2.createSampler(RandomSource.create(RandomSource.WELL_19937_C, 1000));
- SummaryStatistics stats = new SummaryStatistics();
- for (int i = 1; i < 1000; i++) {
- stats.addValue(sampler.sample());
- }
- Assert.assertEquals("mean", 5.069831575018909, stats.getMean(), tolerance);
- Assert.assertEquals("std dev", 1.0173699343977738, stats.getStandardDeviation(), tolerance);
- }
-
// Setup for distribution tests
@Override
public ContinuousDistribution makeDistribution() {
- // Create a uniform distribution on [0, 10,000]
+ // Create a uniform distribution on [0, 10,000].
final double[] sourceData = new double[n + 1];
for (int i = 0; i < n + 1; i++) {
sourceData[i] = i;
}
- EmpiricalDistribution dist = new EmpiricalDistribution();
- dist.load(sourceData);
+ EmpiricalDistribution dist = EmpiricalDistribution.from(1000, sourceData);
return dist;
}
- /** Uniform bin mass = 10/10001 == mass of all but the first bin */
- private final double binMass = 10d / (n + 1);
-
- /** Mass of first bin = 11/10001 */
- private final double firstBinMass = 11d / (n + 1);
-
@Override
public double[] makeCumulativeTestPoints() {
final double[] testPoints = new double[] {9, 10, 15, 1000, 5004, 9999};
@@ -529,10 +334,9 @@
for (int i = 51; i < 100; i++) {
data[i] = 1 - 1 / (100 - (double) i + 2);
}
- EmpiricalDistribution dist = new EmpiricalDistribution(10);
- dist.load(data);
+ EmpiricalDistribution dist = EmpiricalDistribution.from(10, data);
ContinuousDistribution.Sampler sampler
- = dist.createSampler(RandomSource.create(RandomSource.WELL_19937_C, 1000));
+ = dist.createSampler(RandomSource.WELL_19937_C.create(1000));
for (int i = 0; i < 1000; i++) {
final double dev = sampler.sample();
Assert.assertTrue(dev < 1);
@@ -546,10 +350,9 @@
@Test
public void testNoBinVariance() {
final double[] data = {0, 0, 1, 1};
- EmpiricalDistribution dist = new EmpiricalDistribution(2);
- dist.load(data);
+ EmpiricalDistribution dist = EmpiricalDistribution.from(2, data);
ContinuousDistribution.Sampler sampler
- = dist.createSampler(RandomSource.create(RandomSource.WELL_19937_C, 1000));
+ = dist.createSampler(RandomSource.WELL_19937_C.create(1000));
for (int i = 0; i < 1000; i++) {
final double dev = sampler.sample();
Assert.assertTrue(dev == 0 || dev == 1);
@@ -588,17 +391,18 @@
@Test
public void testKernelOverrideConstant() {
- final EmpiricalDistribution dist = new ConstantKernelEmpiricalDistribution(5);
- final double[] data = {1d,2d,3d, 4d,5d,6d, 7d,8d,9d, 10d,11d,12d, 13d,14d,15d};
- dist.load(data);
- ContinuousDistribution.Sampler sampler
- = dist.createSampler(RandomSource.create(RandomSource.WELL_19937_C, 1000));
+ final double[] data = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
+ final EmpiricalDistribution dist =
+ EmpiricalDistribution.from(5, data,
+ s -> new ConstantContinuousDistribution(s.getMean()));
+ final ContinuousDistribution.Sampler sampler
+ = dist.createSampler(RandomSource.WELL_19937_C.create(1000));
// Bin masses concentrated on 2, 5, 8, 11, 14 <- effectively discrete uniform distribution over these
- double[] values = {2d, 5d, 8d, 11d, 14d};
+ final double[] values = {2d, 5d, 8d, 11d, 14d};
for (int i = 0; i < 20; i++) {
Assert.assertTrue(Arrays.binarySearch(values, sampler.sample()) >= 0);
}
- final double tol = 10E-12;
+ final double tol = 1e-12;
Assert.assertEquals(0.0, dist.cumulativeProbability(1), tol);
Assert.assertEquals(0.2, dist.cumulativeProbability(2), tol);
Assert.assertEquals(0.6, dist.cumulativeProbability(10), tol);
@@ -616,11 +420,12 @@
@Test
public void testKernelOverrideUniform() {
- final EmpiricalDistribution dist = new UniformKernelEmpiricalDistribution(5);
- final double[] data = {1d,2d,3d, 4d,5d,6d, 7d,8d,9d, 10d,11d,12d, 13d,14d,15d};
- dist.load(data);
- ContinuousDistribution.Sampler sampler
- = dist.createSampler(RandomSource.create(RandomSource.WELL_19937_C, 1000));
+ final double[] data = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
+ final EmpiricalDistribution dist =
+ EmpiricalDistribution.from(5, data,
+ s -> new UniformContinuousDistribution(s.getMin(), s.getMax()));
+ final ContinuousDistribution.Sampler sampler
+ = dist.createSampler(RandomSource.WELL_19937_C.create(1000));
// Kernels are uniform distributions on [1,3], [4,6], [7,9], [10,12], [13,15]
final double bounds[] = {3d, 6d, 9d, 12d};
final double tol = 10E-12;
@@ -648,7 +453,7 @@
@Test
public void testMath1431() {
- final UniformRandomProvider rng = RandomSource.create(RandomSource.WELL_19937_C, 1000);
+ final UniformRandomProvider rng = RandomSource.WELL_19937_C.create(1000);
final ContinuousDistribution.Sampler exponentialDistributionSampler
= new ExponentialDistribution(0.05).createSampler(rng);
final double[] empiricalDataPoints = new double[3000];
@@ -656,8 +461,7 @@
empiricalDataPoints[i] = exponentialDistributionSampler.sample();
}
- final EmpiricalDistribution testDistribution = new EmpiricalDistribution(100);
- testDistribution.load(empiricalDataPoints);
+ final EmpiricalDistribution testDistribution = EmpiricalDistribution.from(100, empiricalDataPoints);
for (int i = 0; i < 1000; i++) {
final double point = rng.nextDouble();
@@ -684,8 +488,7 @@
6461.3944, 6384.1345
};
- final EmpiricalDistribution ed = new EmpiricalDistribution(data.length / 10);
- ed.load(data);
+ final EmpiricalDistribution ed = EmpiricalDistribution.from(data.length / 10, data);
double v;
double p;
@@ -702,33 +505,4 @@
v = ed.inverseCumulativeProbability(p);
Assert.assertTrue("p=" + p + " => v=" + v, v < 6350);
}
-
- /**
- * Empirical distribution using a constant smoothing kernel.
- */
- private class ConstantKernelEmpiricalDistribution extends EmpiricalDistribution {
- private static final long serialVersionUID = 1L;
- ConstantKernelEmpiricalDistribution(int i) {
- super(i);
- }
- // Use constant distribution equal to bin mean within bin
- @Override
- protected ContinuousDistribution getKernel(SummaryStatistics bStats) {
- return new ConstantContinuousDistribution(bStats.getMean());
- }
- }
-
- /**
- * Empirical distribution using a uniform smoothing kernel.
- */
- private class UniformKernelEmpiricalDistribution extends EmpiricalDistribution {
- private static final long serialVersionUID = 2963149194515159653L;
- UniformKernelEmpiricalDistribution(int i) {
- super(i);
- }
- @Override
- protected ContinuousDistribution getKernel(SummaryStatistics bStats) {
- return new UniformContinuousDistribution(bStats.getMin(), bStats.getMax());
- }
- }
}