| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.commons.statistics.inference; |
| |
| import java.util.Objects; |
| import org.apache.commons.statistics.distribution.HypergeometricDistribution; |
| |
| /** |
| * Implements Fisher's exact test. |
| * |
| * <p>Performs an exact test for the statistical significance of the association (contingency) |
| * between two kinds of categorical classification. |
| * |
| * <p>Fisher's test applies in the case that the row sums and column sums are fixed in advance |
| * and not random. |
| * |
| * @see <a href="https://en.wikipedia.org/wiki/Fisher%27s_exact_test">Fisher's exact test (Wikipedia)</a> |
| * @since 1.1 |
| */ |
| public final class FisherExactTest { |
| /** Default instance. */ |
| private static final FisherExactTest DEFAULT = new FisherExactTest(AlternativeHypothesis.TWO_SIDED); |
| |
| /** Alternative hypothesis. */ |
| private final AlternativeHypothesis alternative; |
| |
| /** |
| * @param alternative Alternative hypothesis. |
| */ |
| private FisherExactTest(AlternativeHypothesis alternative) { |
| this.alternative = alternative; |
| } |
| |
| /** |
| * Return an instance using the default options. |
| * |
| * <ul> |
| * <li>{@link AlternativeHypothesis#TWO_SIDED} |
| * </ul> |
| * |
| * @return default instance |
| */ |
| public static FisherExactTest withDefaults() { |
| return DEFAULT; |
| } |
| |
| /** |
| * Return an instance with the configured alternative hypothesis. |
| * |
| * @param v Value. |
| * @return an instance |
| */ |
| public FisherExactTest with(AlternativeHypothesis v) { |
| return new FisherExactTest(Objects.requireNonNull(v)); |
| } |
| |
| /** |
| * Compute the prior odds ratio for the 2-by-2 contingency table. This is the |
| * "sample" or "unconditional" maximum likelihood estimate. For a table of: |
| * |
| * <p>\[ \left[ {\begin{array}{cc} |
| * a & b \\ |
| * c & d \\ |
| * \end{array} } \right] \] |
| * |
| * <p>this is: |
| * |
| * <p>\[ r = \frac{a d}{b c} \] |
| * |
| * <p>Special cases: |
| * <ul> |
| * <li>If the denominator is zero, the value is {@linkplain Double#POSITIVE_INFINITY infinity}. |
| * <li>If a row or column sum is zero, the value is {@link Double#NaN NaN}. |
| * </ul> |
| * |
| * <p>Note: This statistic is equal to the statistic computed by the SciPy function |
| * {@code scipy.stats.fisher_exact}. It is different to the conditional maximum |
| * likelihood estimate computed by R function {@code fisher.test}. |
| * |
| * @param table 2-by-2 contingency table. |
| * @return odds ratio |
| * @throws IllegalArgumentException if the {@code table} is not a 2-by-2 table; any |
| * table entry is negative; or the sum of the table is 0 or larger than a 32-bit signed integer. |
| * @see #test(int[][]) |
| */ |
| public double statistic(int[][] table) { |
| Arguments.checkTable(table); |
| final double a = table[0][0]; |
| final double b = table[0][1]; |
| final double c = table[1][0]; |
| final double d = table[1][1]; |
| return (a * d) / (b * c); |
| } |
| |
| /** |
| * Performs Fisher's exact test on the 2-by-2 contingency table. |
| * |
| * <p>The test statistic is equal to the prior odds ratio. This is the |
| * "sample" or "unconditional" maximum likelihood estimate. |
| * |
| * <p>The test is defined by the {@link AlternativeHypothesis}. |
| * |
| * <p>For a table of [[a, b], [c, d]] the possible values of any table are conditioned |
| * with the same marginals (row and column totals). In this case the possible values {@code x} |
| * of the upper-left element {@code a} are {@code min(0, a - d) <= x <= a + min(b, c)}. |
| * <ul> |
| * <li>'two-sided': the odds ratio of the underlying population is not one; the p-value |
| * is the probability that a random table has probability equal to or less than the input table. |
| * <li>'greater': the odds ratio of the underlying population is greater than one; the p-value |
| * is the probability that a random table has {@code x >= a}. |
| * <li>'less': the odds ratio of the underlying population is less than one; the p-value |
| * is the probability that a random table has {@code x <= a}. |
| * </ul> |
| * |
| * @param table 2-by-2 contingency table. |
| * @return test result |
| * @throws IllegalArgumentException if the {@code table} is not a 2-by-2 table; any |
| * table entry is negative; or the sum of the table is 0 or larger than a 32-bit signed integer. |
| * @see #with(AlternativeHypothesis) |
| * @see #statistic(int[][]) |
| */ |
| public SignificanceResult test(int[][] table) { |
| Arguments.checkTable(table); |
| final int a = table[0][0]; |
| final int b = table[0][1]; |
| final int c = table[1][0]; |
| final int d = table[1][1]; |
| |
| // Odd-ratio. |
| final double statistic = ((double) a * d) / ((double) b * c); |
| |
| final int nn = a + b + c + d; |
| final int k = a + b; |
| final int n = a + c; |
| |
| // Note: The distribution validates the population size is > 0 |
| final HypergeometricDistribution distribution = HypergeometricDistribution.of(nn, k, n); |
| double p; |
| if (alternative == AlternativeHypothesis.GREATER_THAN) { |
| p = distribution.survivalProbability(a - 1); |
| } else if (alternative == AlternativeHypothesis.LESS_THAN) { |
| p = distribution.cumulativeProbability(a); |
| } else { |
| p = twoSidedTest(a, distribution); |
| } |
| return new BaseSignificanceResult(statistic, p); |
| } |
| |
| /** |
| * Returns the <i>observed significance level</i>, or p-value, associated with a |
| * two-sided test about the observed value. |
| * |
| * @param k Observed value. |
| * @param distribution Hypergeometric distribution. |
| * @return p-value |
| */ |
| private static double twoSidedTest(int k, HypergeometricDistribution distribution) { |
| // Find all i where Pr(X = i) <= Pr(X = k) and sum them. |
| // Exploit the known unimodal distribution to increase the |
| // search speed. Note the search depends only on magnitude differences. |
| // The current HypergeometricDistribution is faster using log probability |
| // as it omits a call to Math.exp. |
| |
| // Use the mode as the point of largest probability. |
| // The lower or upper mode is important for the search below. |
| final int nn = distribution.getPopulationSize(); |
| final int kk = distribution.getNumberOfSuccesses(); |
| final int n = distribution.getSampleSize(); |
| final double v = ((double) n + 1) * ((double) kk + 1) / (nn + 2.0); |
| final int m1 = (int) Math.ceil(v) - 1; |
| final int m2 = (int) Math.floor(v); |
| if (k < m1) { |
| final double pk = distribution.logProbability(k); |
| // Lower half = cdf(k) |
| // Find upper half. As k < lower mode i should never |
| // reach the lower mode based on the probability alone. |
| // Bracket with the upper mode. |
| final int i = Searches.searchDescending(m2, distribution.getSupportUpperBound(), pk, |
| distribution::logProbability); |
| return distribution.cumulativeProbability(k) + |
| distribution.survivalProbability(i - 1); |
| } else if (k > m2) { |
| final double pk = distribution.logProbability(k); |
| // Upper half = sf(k - 1) |
| // Find lower half. As k > upper mode i should never |
| // reach the upper mode based on the probability alone. |
| // Bracket with the lower mode. |
| final int i = Searches.searchAscending(distribution.getSupportLowerBound(), m1, pk, |
| distribution::logProbability); |
| return distribution.cumulativeProbability(i) + |
| distribution.survivalProbability(k - 1); |
| } |
| // k == mode |
| // Edge case where the sum of probabilities will be either |
| // 1 or 1 - Pr(X = mode) where mode != k |
| final double pk = distribution.probability(k); |
| final double pm = distribution.probability(k == m1 ? m2 : m1); |
| return pm > pk ? 1 - pm : 1; |
| } |
| } |