blob: 2db1ee40f8e2a6f229c51ba9cc58aaf872f5c93a [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.commons.math3.stat.descriptive;
import java.util.ArrayList;
import java.util.Collection;
import org.apache.commons.math3.TestUtils;
import org.apache.commons.math3.distribution.RealDistribution;
import org.apache.commons.math3.distribution.UniformRealDistribution;
import org.apache.commons.math3.distribution.IntegerDistribution;
import org.apache.commons.math3.distribution.UniformIntegerDistribution;
import org.apache.commons.math3.util.Precision;
import org.junit.Assert;
import org.junit.Test;
/**
* Test cases for {@link AggregateSummaryStatistics}
*/
public class AggregateSummaryStatisticsTest {
/**
* Tests the standard aggregation behavior
*/
@Test
public void testAggregation() {
AggregateSummaryStatistics aggregate = new AggregateSummaryStatistics();
SummaryStatistics setOneStats = aggregate.createContributingStatistics();
SummaryStatistics setTwoStats = aggregate.createContributingStatistics();
Assert.assertNotNull("The set one contributing stats are null", setOneStats);
Assert.assertNotNull("The set two contributing stats are null", setTwoStats);
Assert.assertNotSame("Contributing stats objects are the same", setOneStats, setTwoStats);
setOneStats.addValue(2);
setOneStats.addValue(3);
setOneStats.addValue(5);
setOneStats.addValue(7);
setOneStats.addValue(11);
Assert.assertEquals("Wrong number of set one values", 5, setOneStats.getN());
Assert.assertTrue("Wrong sum of set one values", Precision.equals(28.0, setOneStats.getSum(), 1));
setTwoStats.addValue(2);
setTwoStats.addValue(4);
setTwoStats.addValue(8);
Assert.assertEquals("Wrong number of set two values", 3, setTwoStats.getN());
Assert.assertTrue("Wrong sum of set two values", Precision.equals(14.0, setTwoStats.getSum(), 1));
Assert.assertEquals("Wrong number of aggregate values", 8, aggregate.getN());
Assert.assertTrue("Wrong aggregate sum", Precision.equals(42.0, aggregate.getSum(), 1));
}
/**
* Verify that aggregating over a partition gives the same results
* as direct computation.
*
* 1) Randomly generate a dataset of 10-100 values
* from [-100, 100]
* 2) Divide the dataset it into 2-5 partitions
* 3) Create an AggregateSummaryStatistic and ContributingStatistics
* for each partition
* 4) Compare results from the AggregateSummaryStatistic with values
* returned by a single SummaryStatistics instance that is provided
* the full dataset
*/
@Test
public void testAggregationConsistency() {
// Generate a random sample and random partition
double[] totalSample = generateSample();
double[][] subSamples = generatePartition(totalSample);
int nSamples = subSamples.length;
// Create aggregator and total stats for comparison
AggregateSummaryStatistics aggregate = new AggregateSummaryStatistics();
SummaryStatistics totalStats = new SummaryStatistics();
// Create array of component stats
SummaryStatistics componentStats[] = new SummaryStatistics[nSamples];
for (int i = 0; i < nSamples; i++) {
// Make componentStats[i] a contributing statistic to aggregate
componentStats[i] = aggregate.createContributingStatistics();
// Add values from subsample
for (int j = 0; j < subSamples[i].length; j++) {
componentStats[i].addValue(subSamples[i][j]);
}
}
// Compute totalStats directly
for (int i = 0; i < totalSample.length; i++) {
totalStats.addValue(totalSample[i]);
}
/*
* Compare statistics in totalStats with aggregate.
* Note that guaranteed success of this comparison depends on the
* fact that <aggregate> gets values in exactly the same order
* as <totalStats>.
*
*/
Assert.assertEquals(totalStats.getSummary(), aggregate.getSummary());
}
/**
* Test aggregate function by randomly generating a dataset of 10-100 values
* from [-100, 100], dividing it into 2-5 partitions, computing stats for each
* partition and comparing the result of aggregate(...) applied to the collection
* of per-partition SummaryStatistics with a single SummaryStatistics computed
* over the full sample.
*/
@Test
public void testAggregate() {
// Generate a random sample and random partition
double[] totalSample = generateSample();
double[][] subSamples = generatePartition(totalSample);
int nSamples = subSamples.length;
// Compute combined stats directly
SummaryStatistics totalStats = new SummaryStatistics();
for (int i = 0; i < totalSample.length; i++) {
totalStats.addValue(totalSample[i]);
}
// Now compute subsample stats individually and aggregate
SummaryStatistics[] subSampleStats = new SummaryStatistics[nSamples];
for (int i = 0; i < nSamples; i++) {
subSampleStats[i] = new SummaryStatistics();
}
Collection<SummaryStatistics> aggregate = new ArrayList<SummaryStatistics>();
for (int i = 0; i < nSamples; i++) {
for (int j = 0; j < subSamples[i].length; j++) {
subSampleStats[i].addValue(subSamples[i][j]);
}
aggregate.add(subSampleStats[i]);
}
// Compare values
StatisticalSummary aggregatedStats = AggregateSummaryStatistics.aggregate(aggregate);
assertEquals(totalStats.getSummary(), aggregatedStats, 10E-12);
}
/**
* Similar to {@link #testAggregate()} but operating on
* {@link StatisticalSummary} instead.
*/
@Test
public void testAggregateStatisticalSummary() {
// Generate a random sample and random partition
double[] totalSample = generateSample();
double[][] subSamples = generatePartition(totalSample);
int nSamples = subSamples.length;
// Compute combined stats directly
SummaryStatistics totalStats = new SummaryStatistics();
for (int i = 0; i < totalSample.length; i++) {
totalStats.addValue(totalSample[i]);
}
// Now compute subsample stats individually and aggregate
SummaryStatistics[] subSampleStats = new SummaryStatistics[nSamples];
for (int i = 0; i < nSamples; i++) {
subSampleStats[i] = new SummaryStatistics();
}
Collection<StatisticalSummary> aggregate = new ArrayList<StatisticalSummary>();
for (int i = 0; i < nSamples; i++) {
for (int j = 0; j < subSamples[i].length; j++) {
subSampleStats[i].addValue(subSamples[i][j]);
}
aggregate.add(subSampleStats[i].getSummary());
}
// Compare values
StatisticalSummary aggregatedStats = AggregateSummaryStatistics.aggregate(aggregate);
assertEquals(totalStats.getSummary(), aggregatedStats, 10E-12);
}
@Test
public void testAggregateDegenerate() {
double[] totalSample = {1, 2, 3, 4, 5};
double[][] subSamples = {{1}, {2}, {3}, {4}, {5}};
// Compute combined stats directly
SummaryStatistics totalStats = new SummaryStatistics();
for (int i = 0; i < totalSample.length; i++) {
totalStats.addValue(totalSample[i]);
}
// Now compute subsample stats individually and aggregate
SummaryStatistics[] subSampleStats = new SummaryStatistics[5];
for (int i = 0; i < 5; i++) {
subSampleStats[i] = new SummaryStatistics();
}
Collection<SummaryStatistics> aggregate = new ArrayList<SummaryStatistics>();
for (int i = 0; i < 5; i++) {
for (int j = 0; j < subSamples[i].length; j++) {
subSampleStats[i].addValue(subSamples[i][j]);
}
aggregate.add(subSampleStats[i]);
}
// Compare values
StatisticalSummaryValues aggregatedStats = AggregateSummaryStatistics.aggregate(aggregate);
assertEquals(totalStats.getSummary(), aggregatedStats, 10E-12);
}
@Test
public void testAggregateSpecialValues() {
double[] totalSample = {Double.POSITIVE_INFINITY, 2, 3, Double.NaN, 5};
double[][] subSamples = {{Double.POSITIVE_INFINITY, 2}, {3}, {Double.NaN}, {5}};
// Compute combined stats directly
SummaryStatistics totalStats = new SummaryStatistics();
for (int i = 0; i < totalSample.length; i++) {
totalStats.addValue(totalSample[i]);
}
// Now compute subsample stats individually and aggregate
SummaryStatistics[] subSampleStats = new SummaryStatistics[5];
for (int i = 0; i < 4; i++) {
subSampleStats[i] = new SummaryStatistics();
}
Collection<SummaryStatistics> aggregate = new ArrayList<SummaryStatistics>();
for (int i = 0; i < 4; i++) {
for (int j = 0; j < subSamples[i].length; j++) {
subSampleStats[i].addValue(subSamples[i][j]);
}
aggregate.add(subSampleStats[i]);
}
// Compare values
StatisticalSummaryValues aggregatedStats = AggregateSummaryStatistics.aggregate(aggregate);
assertEquals(totalStats.getSummary(), aggregatedStats, 10E-12);
}
/**
* Verifies that a StatisticalSummary and a StatisticalSummaryValues are equal up
* to delta, with NaNs, infinities returned in the same spots. For max, min, n, values
* have to agree exactly, delta is used only for sum, mean, variance, std dev.
*/
protected static void assertEquals(StatisticalSummary expected, StatisticalSummary observed, double delta) {
TestUtils.assertEquals(expected.getMax(), observed.getMax(), 0);
TestUtils.assertEquals(expected.getMin(), observed.getMin(), 0);
Assert.assertEquals(expected.getN(), observed.getN());
TestUtils.assertEquals(expected.getSum(), observed.getSum(), delta);
TestUtils.assertEquals(expected.getMean(), observed.getMean(), delta);
TestUtils.assertEquals(expected.getStandardDeviation(), observed.getStandardDeviation(), delta);
TestUtils.assertEquals(expected.getVariance(), observed.getVariance(), delta);
}
/**
* Generates a random sample of double values.
* Sample size is random, between 10 and 100 and values are
* uniformly distributed over [-100, 100].
*
* @return array of random double values
*/
private double[] generateSample() {
final IntegerDistribution size = new UniformIntegerDistribution(10, 100);
final RealDistribution randomData = new UniformRealDistribution(-100, 100);
final int sampleSize = size.sample();
final double[] out = randomData.sample(sampleSize);
return out;
}
/**
* Generates a partition of <sample> into up to 5 sequentially selected
* subsamples with randomly selected partition points.
*
* @param sample array to partition
* @return rectangular array with rows = subsamples
*/
private double[][] generatePartition(double[] sample) {
final int length = sample.length;
final double[][] out = new double[5][];
int cur = 0; // beginning of current partition segment
int offset = 0; // end of current partition segment
int sampleCount = 0; // number of segments defined
for (int i = 0; i < 5; i++) {
if (cur == length || offset == length) {
break;
}
final int next;
if (i == 4 || cur == length - 1) {
next = length - 1;
} else {
next = (new UniformIntegerDistribution(cur, length - 1)).sample();
}
final int subLength = next - cur + 1;
out[i] = new double[subLength];
System.arraycopy(sample, offset, out[i], 0, subLength);
cur = next + 1;
sampleCount++;
offset += subLength;
}
if (sampleCount < 5) {
double[][] out2 = new double[sampleCount][];
for (int j = 0; j < sampleCount; j++) {
final int curSize = out[j].length;
out2[j] = new double[curSize];
System.arraycopy(out[j], 0, out2[j], 0, curSize);
}
return out2;
} else {
return out;
}
}
}