// blob: ded63a4bdf8beed23856910d8dc40d86ceab9413 [file] [log] [blame]
/*
* Copyright 2003-2004 The Apache Software Foundation.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.commons.math.random;
import java.io.Serializable;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.math.stat.descriptive.SummaryStatistics;
import org.apache.commons.math.stat.descriptive.StatisticalSummary;
/**
 * Implements <code>EmpiricalDistribution</code> interface.  This implementation
 * uses what amounts to the
 * <a href="http://nedwww.ipac.caltech.edu/level5/March02/Silverman/Silver2_6.html">
 * Variable Kernel Method</a> with Gaussian smoothing:<p>
 * <strong>Digesting the input file</strong>
 * <ol><li>Pass the file once to compute min and max.</li>
 * <li>Divide the range from min-max into <code>binCount</code> "bins."</li>
 * <li>Pass the data file again, computing bin counts and univariate
 *     statistics (mean, std dev.) for each of the bins </li>
 * <li>Divide the interval (0,1) into subintervals associated with the bins,
 *     with the length of a bin's subinterval proportional to its count.</li></ol>
 * <strong>Generating random values from the distribution</strong><ol>
 * <li>Generate a uniformly distributed value in (0,1) </li>
 * <li>Select the subinterval to which the value belongs.</li>
 * <li>Generate a random Gaussian value with mean = mean of the associated
 *     bin and std dev = std dev of associated bin.</li></ol></p><p>
 * <strong>USAGE NOTES:</strong><ul>
 * <li>The <code>binCount</code> is set by default to 1000.  A good rule of thumb
 *    is to set the bin count to approximately the length of the input file divided
 *    by 10. </li>
 * <li>The input file <i>must</i> be a plain text file containing one valid numeric
 *    entry per line.</li>
 * </ul></p>
 *
 * @version $Revision$ $Date$
 */
public class EmpiricalDistributionImpl implements Serializable, EmpiricalDistribution {

    /** Serializable version identifier */
    static final long serialVersionUID = -6773236347582113490L;

    /** List of SummaryStatistics objects characterizing the bins */
    private ArrayList binStats = null;

    /** Sample statistics */
    SummaryStatistics sampleStats = null;

    /** number of bins */
    private int binCount = 1000;

    /** is the distribution loaded? */
    private boolean loaded = false;

    /** upper bounds of subintervals in (0,1) "belonging" to the bins */
    private double[] upperBounds = null;

    /** RandomData instance to use in repeated calls to getNext() */
    private RandomData randomData = new RandomDataImpl();

    /**
     * Creates a new EmpiricalDistribution with the default bin count.
     */
    public EmpiricalDistributionImpl() {
        binStats = new ArrayList();
    }

    /**
     * Creates a new EmpiricalDistribution with the specified bin count.
     * The value is stored as given; it is not validated here.
     *
     * @param binCount number of bins (expected to be positive)
     */
    public EmpiricalDistributionImpl(int binCount) {
        this.binCount = binCount;
        binStats = new ArrayList();
    }

    /**
     * Computes the empirical distribution from the provided
     * array of numbers.
     * <p>
     * Any exception raised while computing the statistics is rethrown as a
     * <code>RuntimeException</code> carrying the original exception as its
     * cause.</p>
     *
     * @param in the input data array
     */
    public void load(double[] in) {
        DataAdapter da = new ArrayDataAdapter(in);
        try {
            da.computeStats();
            fillBinStats(in);
        } catch (Exception e) {
            // keep the original exception reachable through getCause()
            throw new RuntimeException(e.getMessage(), e);
        }
        loaded = true;
    }

    /**
     * Computes the empirical distribution using data read from a URL.
     * The URL is opened twice: once to compute the sample statistics and
     * once more to fill the bins.
     *
     * @param url  url of the input file
     *
     * @throws IOException if an IO error occurs, or if the first pass over
     *         the data fails for any other reason (for example an entry
     *         that cannot be parsed as a number)
     */
    public void load(URL url) throws IOException {
        BufferedReader in =
            new BufferedReader(new InputStreamReader(url.openStream()));
        try {
            DataAdapter da = new StreamDataAdapter(in);
            try {
                da.computeStats();
            } catch (Exception e) {
                throw wrapAsIOException(e);
            }
            // the first reader is exhausted (and closed by the adapter);
            // reopen the source for the second pass
            in = new BufferedReader(new InputStreamReader(url.openStream()));
            fillBinStats(in);
            loaded = true;
        } finally {
            closeQuietly(in);
        }
    }

    /**
     * Computes the empirical distribution from the input file.
     * The file is read twice: once to compute the sample statistics and
     * once more to fill the bins.
     *
     * @param file the input file
     * @throws IOException if an IO error occurs, or if the first pass over
     *         the data fails for any other reason (for example an entry
     *         that cannot be parsed as a number)
     */
    public void load(File file) throws IOException {
        BufferedReader in = new BufferedReader(new FileReader(file));
        try {
            DataAdapter da = new StreamDataAdapter(in);
            try {
                da.computeStats();
            } catch (Exception e) {
                throw wrapAsIOException(e);
            }
            // the first reader is exhausted (and closed by the adapter);
            // reopen the file for the second pass
            in = new BufferedReader(new FileReader(file));
            fillBinStats(in);
            loaded = true;
        } finally {
            closeQuietly(in);
        }
    }

    /**
     * Builds an <code>IOException</code> with the message of the given
     * exception and that exception attached as its cause.
     *
     * @param e the exception to wrap
     * @return a new IOException whose cause is <code>e</code>
     */
    private static IOException wrapAsIOException(Exception e) {
        IOException wrapped = new IOException(e.getMessage());
        wrapped.initCause(e);
        return wrapped;
    }

    /**
     * Closes the reader, ignoring a <code>null</code> argument and any
     * exception raised by <code>close()</code>. Used from <code>finally</code>
     * blocks so that a failure to close never masks the primary exception.
     *
     * @param reader the reader to close, may be <code>null</code>
     */
    private static void closeQuietly(BufferedReader reader) {
        if (reader == null) {
            return;
        }
        try {
            reader.close();
        } catch (Exception ignored) {
            // best-effort cleanup only
        }
    }

    /**
     * Returns the index of the bin to which <code>value</code> belongs.
     * <p>
     * The result is clamped to <code>[0, binCount - 1]</code>: the lower
     * clamp puts the minimum value into bin 0, the upper clamp keeps values at
     * (or, after floating-point rounding of <code>delta</code>, marginally
     * beyond) the maximum inside the last bin instead of indexing past the end
     * of <code>binStats</code>. When <code>delta</code> is zero the quotient is
     * NaN, which the <code>int</code> cast turns into 0, so the value lands in
     * bin 0.</p>
     *
     * @param min minimum value of the sample
     * @param value the value whose bin is sought
     * @param delta grid size (width of a bin)
     * @return index of the bin containing <code>value</code>
     */
    private int findBin(double min, double value, double delta) {
        int index = (int) Math.ceil((value - min) / delta) - 1;
        return Math.min(Math.max(index, 0), binCount - 1);
    }

    /**
     * Provides methods for computing <code>sampleStats</code> and
     * <code>binStats</code> abstracting the source of data.
     */
    private abstract class DataAdapter {
        /**
         * Compute bin stats.
         *
         * @param min minimum value
         * @param delta  grid size
         * @throws Exception  if an error occurs computing bin stats
         */
        public abstract void computeBinStats(double min, double delta)
                throws Exception;

        /**
         * Compute sample statistics.
         *
         * @throws Exception if an error occurs computing sample stats
         */
        public abstract void computeStats() throws Exception;
    }

    /**
     * Factory of <code>DataAdapter</code> objects. For every supported source
     * of data (array of doubles, file, etc.) an instance of the proper object
     * is returned.
     */
    private class DataAdapterFactory {
        /**
         * Creates a DataAdapter from a data object
         *
         * @param in object providing access to the data
         * @return DataAdapter instance
         * @throws IllegalArgumentException if <code>in</code> is neither a
         *         <code>BufferedReader</code> nor a <code>double[]</code>
         */
        public DataAdapter getAdapter(Object in) {
            if (in instanceof BufferedReader) {
                return new StreamDataAdapter((BufferedReader) in);
            }
            if (in instanceof double[]) {
                return new ArrayDataAdapter((double[]) in);
            }
            throw new IllegalArgumentException(
                "Input data comes from the unsupported source");
        }
    }

    /**
     * <code>DataAdapter</code> for data provided through some input stream
     */
    private class StreamDataAdapter extends DataAdapter {

        /** Input stream providing access to the data */
        BufferedReader inputStream;

        /**
         * Create a StreamDataAdapter from a BufferedReader
         *
         * @param in BufferedReader input stream
         */
        public StreamDataAdapter(BufferedReader in) {
            super();
            inputStream = in;
        }

        /**
         * Computes binStats: reads one number per line and adds it to the
         * statistics of the bin it falls into. The reader is closed and
         * released once all lines have been consumed.
         *
         * @param min  minimum value
         * @param delta  grid size
         * @throws IOException if an IO error occurs
         */
        public void computeBinStats(double min, double delta)
                throws IOException {
            String str = null;
            while ((str = inputStream.readLine()) != null) {
                double val = Double.parseDouble(str);
                SummaryStatistics stats =
                    (SummaryStatistics) binStats.get(findBin(min, val, delta));
                stats.addValue(val);
            }
            inputStream.close();
            inputStream = null;
        }

        /**
         * Computes sampleStats: reads one number per line and accumulates it
         * into a fresh <code>SummaryStatistics</code>. The reader is closed
         * and released once all lines have been consumed.
         *
         * @throws IOException if an IOError occurs
         */
        public void computeStats() throws IOException {
            String str = null;
            sampleStats = SummaryStatistics.newInstance();
            while ((str = inputStream.readLine()) != null) {
                sampleStats.addValue(Double.parseDouble(str));
            }
            inputStream.close();
            inputStream = null;
        }
    }

    /**
     * <code>DataAdapter</code> for data provided as array of doubles.
     */
    private class ArrayDataAdapter extends DataAdapter {

        /** Array of input data values */
        private double[] inputArray;

        /**
         * Construct an ArrayDataAdapter from a double[] array
         *
         * @param in double[] array holding the data
         */
        public ArrayDataAdapter(double[] in) {
            super();
            inputArray = in;
        }

        /**
         * Computes sampleStats
         *
         * @throws IOException if an IO error occurs
         */
        public void computeStats() throws IOException {
            sampleStats = SummaryStatistics.newInstance();
            for (int i = 0; i < inputArray.length; i++) {
                sampleStats.addValue(inputArray[i]);
            }
        }

        /**
         * Computes binStats
         *
         * @param min  minimum value
         * @param delta  grid size
         * @throws IOException  if an IO error occurs
         */
        public void computeBinStats(double min, double delta)
                throws IOException {
            for (int i = 0; i < inputArray.length; i++) {
                SummaryStatistics stats =
                    (SummaryStatistics) binStats.get(
                        findBin(min, inputArray[i], delta));
                stats.addValue(inputArray[i]);
            }
        }
    }

    /**
     * Fills binStats array (second pass through data file) and derives the
     * (0,1) subinterval upper bounds from the resulting bin counts.
     * <code>sampleStats</code> must already have been computed.
     *
     * @param in object providing access to the data
     * @throws IOException  if an IO error occurs
     */
    private void fillBinStats(Object in) throws IOException {
        // Bins are evenly spaced between the sample min and max
        double min = sampleStats.getMin();
        double max = sampleStats.getMax();
        double delta = (max - min) / (double) binCount;

        // Initialize binStats ArrayList with one empty summary per bin
        binStats.clear();
        for (int i = 0; i < binCount; i++) {
            binStats.add(SummaryStatistics.newInstance());
        }

        // Filling data in binStats Array
        DataAdapter da = new DataAdapterFactory().getAdapter(in);
        try {
            da.computeBinStats(min, delta);
        } catch (RuntimeException e) {
            throw new RuntimeException(e.getMessage(), e);
        } catch (Exception e) {
            throw wrapAsIOException(e);
        }

        // Assign upperBounds based on bin counts: entry i is the cumulative
        // fraction of the sample falling in bins 0..i
        double sampleSize = (double) sampleStats.getN();
        upperBounds = new double[binCount];
        upperBounds[0] =
            ((double) ((SummaryStatistics) binStats.get(0)).getN()) / sampleSize;
        for (int i = 1; i < binCount - 1; i++) {
            upperBounds[i] = upperBounds[i - 1] +
                ((double) ((SummaryStatistics) binStats.get(i)).getN()) / sampleSize;
        }
        // pin the last bound so accumulated rounding cannot leave a gap below 1
        upperBounds[binCount - 1] = 1.0d;
    }

    /**
     * Generates a random value from this distribution.
     *
     * @return the random value.
     * @throws IllegalStateException if the distribution has not been loaded
     */
    public double getNextValue() throws IllegalStateException {

        if (!loaded) {
            throw new IllegalStateException("distribution not loaded");
        }

        // Start with a uniformly distributed random number in (0,1)
        double x = Math.random();

        // Use this to select the bin and generate a Gaussian within the bin
        for (int i = 0; i < binCount; i++) {
            if (x > upperBounds[i]) {
                continue;
            }
            SummaryStatistics stats = (SummaryStatistics) binStats.get(i);
            if (stats.getN() <= 0) {
                // empty bin: fall through to the next bin
                continue;
            }
            double sigma = stats.getStandardDeviation();
            if (sigma > 0) {  // more than one obs
                return randomData.nextGaussian(stats.getMean(), sigma);
            }
            return stats.getMean(); // only one obs in bin
        }
        throw new RuntimeException("No bin selected");
    }

    /**
     * Returns a {@link StatisticalSummary} describing this distribution.
     * <strong>Preconditions:</strong><ul>
     * <li>the distribution must be loaded before invoking this method</li></ul>
     * <p>
     * No check is performed: this method simply returns the current sample
     * statistics, which are <code>null</code> until a <code>load</code> method
     * has computed them.</p>
     *
     * @return the sample statistics, or <code>null</code> if none have been
     *         computed yet
     */
    public StatisticalSummary getSampleStats() {
        return sampleStats;
    }

    /**
     * Returns the number of bins.
     *
     * @return the number of bins.
     */
    public int getBinCount() {
        return binCount;
    }

    /**
     * Returns an ArrayList of {@link SummaryStatistics} instances containing
     * statistics describing the values in each of the bins.  The ArrayList is
     * indexed on the bin number. The list returned is the internal list, not
     * a copy.
     *
     * @return List of bin statistics.
     */
    public List getBinStats() {
        return binStats;
    }

    /**
     * Returns the array of upper bounds of the subintervals of (0,1)
     * associated with the bins. Entry <code>i</code> is the cumulative
     * fraction of the sample values falling in bins <code>0..i</code>; the
     * last entry is always 1.0. The subintervals are: <br/>
     * [0,upperBounds[0]],(upperBounds[0],upperBounds[1]],...,
     * (upperBounds[binCount-2],upperBounds[binCount-1]]
     * <p>
     * The array returned is the internal array, not a copy, and is
     * <code>null</code> until bin statistics have been computed by a
     * <code>load</code> method.</p>
     *
     * @return array of subinterval upper bounds
     */
    public double[] getUpperBounds() {
        return upperBounds;
    }

    /**
     * Property indicating whether or not the distribution has been loaded.
     *
     * @return true if the distribution has been loaded
     */
    public boolean isLoaded() {
        return loaded;
    }
}