/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
| package org.apache.commons.math4.legacy.stat.inference; |
| |
| import java.util.ArrayList; |
| import java.util.Collection; |
| |
| import org.apache.commons.statistics.distribution.FDistribution; |
| import org.apache.commons.math4.legacy.exception.ConvergenceException; |
| import org.apache.commons.math4.legacy.exception.DimensionMismatchException; |
| import org.apache.commons.math4.legacy.exception.MaxCountExceededException; |
| import org.apache.commons.math4.legacy.exception.NullArgumentException; |
| import org.apache.commons.math4.legacy.exception.OutOfRangeException; |
| import org.apache.commons.math4.legacy.exception.util.LocalizedFormats; |
| import org.apache.commons.math4.legacy.stat.descriptive.SummaryStatistics; |
| |
| /** |
| * Implements one-way ANOVA (analysis of variance) statistics. |
| * |
| * <p> Tests for differences between two or more categories of univariate data |
| * (for example, the body mass index of accountants, lawyers, doctors and |
| * computer programmers). When two categories are given, this is equivalent to |
| * the {@link org.apache.commons.math4.legacy.stat.inference.TTest}. |
| * </p><p> |
| * Uses the {@link org.apache.commons.statistics.distribution.FDistribution |
| * commons-math F Distribution implementation} to estimate exact p-values.</p> |
| * <p>This implementation is based on a description at |
| * http://faculty.vassar.edu/lowry/ch13pt1.html</p> |
| * <pre> |
| * Abbreviations: bg = between groups, |
| * wg = within groups, |
| * ss = sum squared deviations |
| * </pre> |
| * |
| * @since 1.2 |
| */ |
| public class OneWayAnova { |
| |
| /** |
| * Default constructor. |
| */ |
| public OneWayAnova() { |
| } |
| |
| /** |
| * Computes the ANOVA F-value for a collection of <code>double[]</code> |
| * arrays. |
| * |
| * <p><strong>Preconditions</strong>: <ul> |
| * <li>The categoryData <code>Collection</code> must contain |
| * <code>double[]</code> arrays.</li> |
| * <li> There must be at least two <code>double[]</code> arrays in the |
| * <code>categoryData</code> collection and each of these arrays must |
| * contain at least two values.</li></ul><p> |
| * This implementation computes the F statistic using the definitional |
| * formula<pre> |
| * F = msbg/mswg</pre> |
| * where<pre> |
| * msbg = between group mean square |
| * mswg = within group mean square</pre> |
| * are as defined <a href="http://faculty.vassar.edu/lowry/ch13pt1.html"> |
| * here</a> |
| * |
| * @param categoryData <code>Collection</code> of <code>double[]</code> |
| * arrays each containing data for one category |
| * @return Fvalue |
| * @throws NullArgumentException if <code>categoryData</code> is <code>null</code> |
| * @throws DimensionMismatchException if the length of the <code>categoryData</code> |
| * array is less than 2 or a contained <code>double[]</code> array does not have |
| * at least two values |
| */ |
| public double anovaFValue(final Collection<double[]> categoryData) |
| throws NullArgumentException, DimensionMismatchException { |
| |
| AnovaStats a = anovaStats(categoryData); |
| return a.f; |
| |
| } |
| |
| /** |
| * Computes the ANOVA P-value for a collection of <code>double[]</code> |
| * arrays. |
| * |
| * <p><strong>Preconditions</strong>: <ul> |
| * <li>The categoryData <code>Collection</code> must contain |
| * <code>double[]</code> arrays.</li> |
| * <li> There must be at least two <code>double[]</code> arrays in the |
| * <code>categoryData</code> collection and each of these arrays must |
| * contain at least two values.</li></ul><p> |
| * This implementation uses the |
| * {@link org.apache.commons.statistics.distribution.FDistribution |
| * commons-math F Distribution implementation} to estimate the exact |
| * p-value, using the formula<pre> |
| * p = 1 - cumulativeProbability(F)</pre> |
| * where <code>F</code> is the F value and <code>cumulativeProbability</code> |
| * is the commons-math implementation of the F distribution. |
| * |
| * @param categoryData <code>Collection</code> of <code>double[]</code> |
| * arrays each containing data for one category |
| * @return Pvalue |
| * @throws NullArgumentException if <code>categoryData</code> is <code>null</code> |
| * @throws DimensionMismatchException if the length of the <code>categoryData</code> |
| * array is less than 2 or a contained <code>double[]</code> array does not have |
| * at least two values |
| * @throws ConvergenceException if the p-value can not be computed due to a convergence error |
| * @throws MaxCountExceededException if the maximum number of iterations is exceeded |
| */ |
| public double anovaPValue(final Collection<double[]> categoryData) |
| throws NullArgumentException, DimensionMismatchException, |
| ConvergenceException, MaxCountExceededException { |
| |
| final AnovaStats a = anovaStats(categoryData); |
| // No try-catch or advertised exception because args are valid |
| // pass a null rng to avoid unneeded overhead as we will not sample from this distribution |
| final FDistribution fdist = FDistribution.of(a.dfbg, a.dfwg); |
| return 1.0 - fdist.cumulativeProbability(a.f); |
| |
| } |
| |
| /** |
| * Computes the ANOVA P-value for a collection of {@link SummaryStatistics}. |
| * |
| * <p><strong>Preconditions</strong>: <ul> |
| * <li>The categoryData <code>Collection</code> must contain |
| * {@link SummaryStatistics}.</li> |
| * <li> There must be at least two {@link SummaryStatistics} in the |
| * <code>categoryData</code> collection and each of these statistics must |
| * contain at least two values.</li></ul><p> |
| * This implementation uses the |
| * {@link org.apache.commons.statistics.distribution.FDistribution |
| * commons-math F Distribution implementation} to estimate the exact |
| * p-value, using the formula<pre> |
| * p = 1 - cumulativeProbability(F)</pre> |
| * where <code>F</code> is the F value and <code>cumulativeProbability</code> |
| * is the commons-math implementation of the F distribution. |
| * |
| * @param categoryData <code>Collection</code> of {@link SummaryStatistics} |
| * each containing data for one category |
| * @param allowOneElementData if true, allow computation for one catagory |
| * only or for one data element per category |
| * @return Pvalue |
| * @throws NullArgumentException if <code>categoryData</code> is <code>null</code> |
| * @throws DimensionMismatchException if the length of the <code>categoryData</code> |
| * array is less than 2 or a contained {@link SummaryStatistics} does not have |
| * at least two values |
| * @throws ConvergenceException if the p-value can not be computed due to a convergence error |
| * @throws MaxCountExceededException if the maximum number of iterations is exceeded |
| * @since 3.2 |
| */ |
| public double anovaPValue(final Collection<SummaryStatistics> categoryData, |
| final boolean allowOneElementData) |
| throws NullArgumentException, DimensionMismatchException, |
| ConvergenceException, MaxCountExceededException { |
| |
| final AnovaStats a = anovaStats(categoryData, allowOneElementData); |
| // pass a null rng to avoid unneeded overhead as we will not sample from this distribution |
| final FDistribution fdist = FDistribution.of(a.dfbg, a.dfwg); |
| return 1.0 - fdist.cumulativeProbability(a.f); |
| |
| } |
| |
| /** |
| * This method calls the method that actually does the calculations (except |
| * P-value). |
| * |
| * @param categoryData |
| * <code>Collection</code> of <code>double[]</code> arrays each |
| * containing data for one category |
| * @return computed AnovaStats |
| * @throws NullArgumentException |
| * if <code>categoryData</code> is <code>null</code> |
| * @throws DimensionMismatchException |
| * if the length of the <code>categoryData</code> array is less |
| * than 2 or a contained <code>double[]</code> array does not |
| * contain at least two values |
| */ |
| private AnovaStats anovaStats(final Collection<double[]> categoryData) |
| throws NullArgumentException, DimensionMismatchException { |
| |
| NullArgumentException.check(categoryData); |
| |
| final Collection<SummaryStatistics> categoryDataSummaryStatistics = |
| new ArrayList<>(categoryData.size()); |
| |
| // convert arrays to SummaryStatistics |
| for (final double[] data : categoryData) { |
| final SummaryStatistics dataSummaryStatistics = new SummaryStatistics(); |
| categoryDataSummaryStatistics.add(dataSummaryStatistics); |
| for (final double val : data) { |
| dataSummaryStatistics.addValue(val); |
| } |
| } |
| |
| return anovaStats(categoryDataSummaryStatistics, false); |
| |
| } |
| |
| /** |
| * Performs an ANOVA test, evaluating the null hypothesis that there |
| * is no difference among the means of the data categories. |
| * |
| * <p><strong>Preconditions</strong>: <ul> |
| * <li>The categoryData <code>Collection</code> must contain |
| * <code>double[]</code> arrays.</li> |
| * <li> There must be at least two <code>double[]</code> arrays in the |
| * <code>categoryData</code> collection and each of these arrays must |
| * contain at least two values.</li> |
| * <li>alpha must be strictly greater than 0 and less than or equal to 0.5. |
| * </li></ul><p> |
| * This implementation uses the |
| * {@link org.apache.commons.statistics.distribution.FDistribution |
| * commons-math F Distribution implementation} to estimate the exact |
| * p-value, using the formula<pre> |
| * p = 1 - cumulativeProbability(F)</pre> |
| * where <code>F</code> is the F value and <code>cumulativeProbability</code> |
| * is the commons-math implementation of the F distribution. |
| * <p>True is returned iff the estimated p-value is less than alpha.</p> |
| * |
| * @param categoryData <code>Collection</code> of <code>double[]</code> |
| * arrays each containing data for one category |
| * @param alpha significance level of the test |
| * @return true if the null hypothesis can be rejected with |
| * confidence 1 - alpha |
| * @throws NullArgumentException if <code>categoryData</code> is <code>null</code> |
| * @throws DimensionMismatchException if the length of the <code>categoryData</code> |
| * array is less than 2 or a contained <code>double[]</code> array does not have |
| * at least two values |
| * @throws OutOfRangeException if <code>alpha</code> is not in the range (0, 0.5] |
| * @throws ConvergenceException if the p-value can not be computed due to a convergence error |
| * @throws MaxCountExceededException if the maximum number of iterations is exceeded |
| */ |
| public boolean anovaTest(final Collection<double[]> categoryData, |
| final double alpha) |
| throws NullArgumentException, DimensionMismatchException, |
| OutOfRangeException, ConvergenceException, MaxCountExceededException { |
| |
| if (alpha <= 0 || alpha > 0.5) { |
| throw new OutOfRangeException( |
| LocalizedFormats.OUT_OF_BOUND_SIGNIFICANCE_LEVEL, |
| alpha, 0, 0.5); |
| } |
| return anovaPValue(categoryData) < alpha; |
| |
| } |
| |
| /** |
| * This method actually does the calculations (except P-value). |
| * |
| * @param categoryData <code>Collection</code> of <code>double[]</code> |
| * arrays each containing data for one category |
| * @param allowOneElementData if true, allow computation for one catagory |
| * only or for one data element per category |
| * @return computed AnovaStats |
| * @throws NullArgumentException if <code>categoryData</code> is <code>null</code> |
| * @throws DimensionMismatchException if <code>allowOneElementData</code> is false and the number of |
| * categories is less than 2 or a contained SummaryStatistics does not contain |
| * at least two values |
| */ |
| private AnovaStats anovaStats(final Collection<SummaryStatistics> categoryData, |
| final boolean allowOneElementData) |
| throws NullArgumentException, DimensionMismatchException { |
| |
| NullArgumentException.check(categoryData); |
| |
| if (!allowOneElementData) { |
| // check if we have enough categories |
| if (categoryData.size() < 2) { |
| throw new DimensionMismatchException(LocalizedFormats.TWO_OR_MORE_CATEGORIES_REQUIRED, |
| categoryData.size(), 2); |
| } |
| |
| // check if each category has enough data |
| for (final SummaryStatistics array : categoryData) { |
| if (array.getN() <= 1) { |
| throw new DimensionMismatchException(LocalizedFormats.TWO_OR_MORE_VALUES_IN_CATEGORY_REQUIRED, |
| (int) array.getN(), 2); |
| } |
| } |
| } |
| |
| int dfwg = 0; |
| double sswg = 0; |
| double totsum = 0; |
| double totsumsq = 0; |
| int totnum = 0; |
| |
| for (final SummaryStatistics data : categoryData) { |
| |
| final double sum = data.getSum(); |
| final double sumsq = data.getSumsq(); |
| final int num = (int) data.getN(); |
| totnum += num; |
| totsum += sum; |
| totsumsq += sumsq; |
| |
| dfwg += num - 1; |
| final double ss = sumsq - ((sum * sum) / num); |
| sswg += ss; |
| } |
| |
| final double sst = totsumsq - ((totsum * totsum) / totnum); |
| final double ssbg = sst - sswg; |
| final int dfbg = categoryData.size() - 1; |
| final double msbg = ssbg / dfbg; |
| final double mswg = sswg / dfwg; |
| final double f = msbg / mswg; |
| |
| return new AnovaStats(dfbg, dfwg, f); |
| |
| } |
| |
| /** |
| Convenience class to pass dfbg,dfwg,F values around within OneWayAnova. |
| No get/set methods provided. |
| */ |
| private static final class AnovaStats { |
| |
| /** Degrees of freedom in numerator (between groups). */ |
| private final int dfbg; |
| |
| /** Degrees of freedom in denominator (within groups). */ |
| private final int dfwg; |
| |
| /** Statistic. */ |
| private final double f; |
| |
| /** |
| * Constructor. |
| * @param dfbg degrees of freedom in numerator (between groups) |
| * @param dfwg degrees of freedom in denominator (within groups) |
| * @param f statistic |
| */ |
| private AnovaStats(int dfbg, int dfwg, double f) { |
| this.dfbg = dfbg; |
| this.dfwg = dfwg; |
| this.f = f; |
| } |
| } |
| |
| } |