blob: 841e3310808497a306a0ff1e31df19be582c081c [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.commons.statistics.inference;
import java.util.Collection;
import java.util.Iterator;
import org.apache.commons.numbers.core.Sum;
import org.apache.commons.statistics.distribution.FDistribution;
/**
* Implements one-way ANOVA (analysis of variance) statistics.
*
* <p>Tests for differences between two or more categories of univariate data
* (for example, the body mass index of accountants, lawyers, doctors and
* computer programmers). When two categories are given, this is equivalent to
* the {@link TTest}.
*
* <p>This implementation computes the F statistic using the definitional formula:
*
* <p>\[ F = \frac{\text{between-group variability}}{\text{within-group variability}} \]
*
* @see <a href="https://en.wikipedia.org/wiki/Analysis_of_variance">Analysis of variance (Wikipedia)</a>
* @see <a href="https://en.wikipedia.org/wiki/F-test#Multiple-comparison_ANOVA_problems">
* Multiple-comparison ANOVA problems (Wikipedia)</a>
* @see <a href="https://www.biostathandbook.com/onewayanova.html">
* McDonald, J.H. 2014. Handbook of Biological Statistics (3rd ed.). Sparky House Publishing, Baltimore, Maryland.
* One-way anova. pp 145-156.</a>
* @since 1.1
*/
public final class OneWayAnova {
/** Default instance. */
private static final OneWayAnova DEFAULT = new OneWayAnova();
/**
* Result for the one-way ANOVA.
*
* <p>This class is immutable.
*
* @since 1.1
*/
public static final class Result extends BaseSignificanceResult {
/** Degrees of freedom in numerator (between groups). */
private final int dfbg;
/** Degrees of freedom in denominator (within groups). */
private final long dfwg;
/** Mean square between groups. */
private final double msbg;
/** Mean square within groups. */
private final double mswg;
/** nO value used to partition the variance. */
private final double nO;
/**
* @param dfbg Degrees of freedom in numerator (between groups).
* @param dfwg Degrees of freedom in denominator (within groups).
* @param msbg Mean square between groups.
* @param mswg Mean square within groups.
* @param nO Factor for partitioning the variance.
* @param f F statistic
* @param p P-value.
*/
Result(int dfbg, long dfwg, double msbg, double mswg, double nO, double f, double p) {
super(f, p);
this.dfbg = dfbg;
this.dfwg = dfwg;
this.msbg = msbg;
this.mswg = mswg;
this.nO = nO;
}
/**
* Gets the degrees of freedom in the numerator (between groups).
*
* @return degrees of freedom between groups
*/
int getDFBG() {
return dfbg;
}
/**
* Gets the degrees of freedom in the denominator (within groups).
*
* @return degrees of freedom within groups
*/
long getDFWG() {
return dfwg;
}
/**
* Gets the mean square between groups.
*
* @return mean square between groups
*/
public double getMSBG() {
return msbg;
}
/**
* Gets the mean square within groups.
*
* @return mean square within groups
*/
public double getMSWG() {
return mswg;
}
/**
* Gets the variance component between groups.
*
* <p>The value is a partitioning of the variance.
* It is the complement of {@link #getVCWG()}.
*
* <p>Partitioning the variance applies only to a model II
* (random effects) one-way anova. This applies when the
* groups are random samples from a larger set of groups;
* partitioning the variance allows comparison of the
* variation between groups to the variation within groups.
*
* <p>If the {@linkplain #getMSBG() MSBG} is less than the
* {@linkplain #getMSWG() MSWG} this returns 0. Otherwise this
* creates an estimate of the added variance component
* between groups as:
*
* <p>\[ \text{between-group variance} = A = (\text{MS}_{\text{bg}} - \text{MS}_{\text{wg}}) / n_o \]
*
* <p>where \( n_o \) is a number close to, but usually less than,
* the arithmetic mean of the sample size \(n_i\) of each
* of the \( a \) groups:
*
* <p>\[ n_o = \frac{1}{a-1} \left( \sum_i{n_i} - \frac{\sum_i{n_i^2}}{\sum_i{n_i}} \right) \]
*
* <p>The added variance component among groups \( A \) is expressed
* as a fraction of the total variance components \( A + B \) where
* \( B \) is the {@linkplain #getMSWG() MSWG}.
*
* @return variance component between groups (in [0, 1]).
*/
public double getVCBG() {
if (msbg <= mswg) {
return 0;
}
// a is an estimate of the between-group variance
final double a = (msbg - mswg) / nO;
final double b = mswg;
return a / (a + b);
}
/**
* Gets the variance component within groups.
*
* <p>The value is a partitioning of the variance.
* It is the complement of {@link #getVCBG()}. See
* that method for details.
*
* @return variance component within groups (in [0, 1]).
*/
public double getVCWG() {
if (msbg <= mswg) {
return 1;
}
final double a = (msbg - mswg) / nO;
final double b = mswg;
return b / (a + b);
}
}
/** Private constructor. */
private OneWayAnova() {
// Do nothing
}
/**
* Return an instance using the default options.
*
* @return default instance
*/
public static OneWayAnova withDefaults() {
return DEFAULT;
}
/**
* Computes the F statistic for an ANOVA test for a collection of category data,
* evaluating the null hypothesis that there is no difference among the means of
* the data categories.
*
* <p>Special cases:
* <ul>
* <li>If the value in each category is the same (no variance within groups) but different
* between groups, the f-value is {@linkplain Double#POSITIVE_INFINITY infinity}.
* <li>If the value in every group is the same (no variance within or between groups),
* the f-value is {@link Double#NaN NaN}.
* </ul>
*
* @param data Category summary data.
* @return F statistic
* @throws IllegalArgumentException if the number of categories is less than
* two; a contained category does not have at least one value; or all
* categories have only one value (zero degrees of freedom within groups)
*/
public double statistic(Collection<double[]> data) {
final double[] f = new double[1];
aov(data, f);
return f[0];
}
/**
* Performs an ANOVA test for a collection of category data,
* evaluating the null hypothesis that there is no difference among the means of
* the data categories.
*
* <p>Special cases:
* <ul>
* <li>If the value in each category is the same (no variance within groups) but different
* between groups, the f-value is {@linkplain Double#POSITIVE_INFINITY infinity} and the p-value is zero.
* <li>If the value in every group is the same (no variance within or between groups),
* the f-value and p-value are {@link Double#NaN NaN}.
* </ul>
*
* @param data Category summary data.
* @return test result
* @throws IllegalArgumentException if the number of categories is less than
* two; a contained category does not have at least one value; or all
* categories have only one value (zero degrees of freedom within groups)
*/
public Result test(Collection<double[]> data) {
return aov(data, null);
}
/**
* Performs an ANOVA test for a collection of category data, evaluating the null
* hypothesis that there is no difference among the means of the data categories.
*
* <p>This is a utility method to allow computation of the F statistic without
* the p-value or partitioning of the variance. If the {@code statistic} is not null
* the method will record the F statistic in the array and return null.
*
* @param data Category summary data.
* @param statistic Result for the F statistic (or null).
* @return test result (or null)
* @throws IllegalArgumentException if the number of categories is less than two; a
* contained category does not have at least one value; or all categories have only
* one value (zero degrees of freedom within groups)
*/
private static Result aov(Collection<double[]> data, double[] statistic) {
Arguments.checkCategoriesRequiredSize(data.size(), 2);
long n = 0;
for (final double[] array : data) {
n += array.length;
Arguments.checkValuesRequiredSize(array.length, 1);
}
final long dfwg = n - data.size();
if (dfwg == 0) {
throw new InferenceException(InferenceException.ZERO, "Degrees of freedom within groups");
}
// wg = within group
// bg = between group
// F = Var(bg) / Var(wg)
// Var = SS / df
// SStotal = sum((x - u)^2) = sum(x^2) - sum(x)^2/n
// = SSwg + SSbg
// Some cancellation of terms reduces the computation to 3 sums:
// SSwg = [ sum(x^2) - sum(x)^2/n ] - [ sum_g { sum(sum(x^2) - sum(x)^2/n) } ]
// SSbg = SStotal - SSwg
// = sum_g { sum(x)^2/n) } - sum(x)^2/n
// SSwg = SStotal - SSbg
// = sum(x^2) - sum_g { sum(x)^2/n) }
// Stabilize the computation by shifting all to a common mean of zero.
// This minimise the magnitude of x^2 terms.
// The terms sum(x)^2/n -> 0. Included them to capture the round-off.
final double m = StatisticUtils.mean(data);
final Sum sxx = Sum.create();
final Sum sx = Sum.create();
final Sum sg = Sum.create();
// Track if each group had the same value
boolean eachSame = true;
for (final double[] array : data) {
eachSame = eachSame && allMatch(array[0], array);
final Sum s = Sum.create();
for (final double v : array) {
final double x = v - m;
s.add(x);
// sum(x)
sx.add(x);
// sum(x^2)
sxx.add(x * x);
}
// Create the negative sum so we can subtract it via 'add'
// -sum_g { sum(x)^2/n) }
sg.add(-Math.pow(s.getAsDouble(), 2) / array.length);
}
// Note: SS terms should not be negative given:
// SS = sum((x - u)^2)
// This can happen due to floating-point error in sum(x^2) - sum(x)^2/n
final double sswg = Math.max(0, sxx.add(sg).getAsDouble());
// Flip the sign back
final double ssbg = Math.max(0, -sg.add(Math.pow(sx.getAsDouble(), 2) / n).getAsDouble());
final int dfbg = data.size() - 1;
// Handle edge-cases:
// Note: 0 / 0 -> NaN : x / 0 -> inf
// These are documented results and should output p=NaN or 0.
// This result will occur naturally.
// However the SS totals may not be 0.0 so we correct these cases.
final boolean allSame = eachSame && allMatch(data);
final double msbg = allSame ? 0 : ssbg / dfbg;
final double mswg = eachSame ? 0 : sswg / dfwg;
final double f = msbg / mswg;
if (statistic != null) {
statistic[0] = f;
return null;
}
final double p = FDistribution.of(dfbg, dfwg).survivalProbability(f);
// Support partitioning the variance
// ni = size of each of the groups
// nO=(1/(a−1))*(sum(ni)−(sum(ni^2)/sum(ni))
final double nO = (n - data.stream()
.mapToDouble(x -> Math.pow(x.length, 2)).sum() / n) / dfbg;
return new Result(dfbg, dfwg, msbg, mswg, nO, f, p);
}
/**
* Return true if all values in the array match the specified value.
*
* @param v Value.
* @param a Array.
* @return true if all match
*/
private static boolean allMatch(double v, double[] a) {
for (final double w : a) {
if (v != w) {
return false;
}
}
return true;
}
/**
* Return true if all values in the arrays match.
*
* <p>Assumes that there are at least two arrays and that each array has the same
* value throughout. Thus only the first element in each array is checked.
*
* @param data Arrays.
* @return true if all match
*/
private static boolean allMatch(Collection<double[]> data) {
final Iterator<double[]> iter = data.iterator();
final double v = iter.next()[0];
while (iter.hasNext()) {
if (iter.next()[0] != v) {
return false;
}
}
return true;
}
}