commons-statistics-inference/src/main/java/org/apache/commons/statistics/inference/FisherExactTest.java - commons-statistics - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.commons.statistics.inference;

 import java.util.Objects;
 import org.apache.commons.statistics.distribution.HypergeometricDistribution;

 /**
  * Implements Fisher's exact test.
  *
  * <p>Performs an exact test for the statistical significance of the association (contingency)
  * between two kinds of categorical classification.
  *
  * <p>Fisher's test applies in the case that the row sums and column sums are fixed in advance
  * and not random.
  *
  * @see <a href="https://en.wikipedia.org/wiki/Fisher%27s_exact_test">Fisher&#39;s exact test (Wikipedia)</a>
  * @since 1.1
  */
 public final class FisherExactTest {
     /** Default instance. */
     private static final FisherExactTest DEFAULT = new FisherExactTest(AlternativeHypothesis.TWO_SIDED);

     /** Alternative hypothesis. */
     private final AlternativeHypothesis alternative;

     /**
      * @param alternative Alternative hypothesis.
      */
     private FisherExactTest(AlternativeHypothesis alternative) {
         this.alternative = alternative;
     }

     /**
      * Return an instance using the default options.
      *
      * <ul>
      * <li>{@link AlternativeHypothesis#TWO_SIDED}
      * </ul>
      *
      * @return default instance
      */
     public static FisherExactTest withDefaults() {
         return DEFAULT;
     }

     /**
      * Return an instance with the configured alternative hypothesis.
      *
      * @param v Value.
      * @return an instance
      */
     public FisherExactTest with(AlternativeHypothesis v) {
         return new FisherExactTest(Objects.requireNonNull(v));
     }

     /**
      * Compute the prior odds ratio for the 2-by-2 contingency table. This is the
      * "sample" or "unconditional" maximum likelihood estimate. For a table of:
      *
      * <p>\[ \left[ {\begin{array}{cc}
      *         a &amp; b \\
      *         c &amp; d \\
      *       \end{array} } \right] \]
      *
      * <p>this is:
      *
      * <p>\[ r = \frac{a d}{b c} \]
      *
      * <p>Special cases:
      * <ul>
      * <li>If the denominator is zero, the value is {@linkplain Double#POSITIVE_INFINITY infinity}.
      * <li>If a row or column sum is zero, the value is {@link Double#NaN NaN}.
      * </ul>
      *
      * <p>Note: This statistic is equal to the statistic computed by the SciPy function
      * {@code scipy.stats.fisher_exact}. It is different to the conditional maximum
      * likelihood estimate computed by R function {@code fisher.test}.
      *
      * @param table 2-by-2 contingency table.
      * @return odds ratio
      * @throws IllegalArgumentException if the {@code table} is not a 2-by-2 table; any
      * table entry is negative; or the sum of the table is 0 or larger than a 32-bit signed integer.
      * @see #test(int[][])
      */
     public double statistic(int[][] table) {
         Arguments.checkTable(table);
         final double a = table[0][0];
         final double b = table[0][1];
         final double c = table[1][0];
         final double d = table[1][1];
         return (a * d) / (b * c);
     }

     /**
      * Performs Fisher's exact test on the 2-by-2 contingency table.
      *
      * <p>The test statistic is equal to the prior odds ratio. This is the
      * "sample" or "unconditional" maximum likelihood estimate.
      *
      * <p>The test is defined by the {@link AlternativeHypothesis}.
      *
      * <p>For a table of [[a, b], [c, d]] the possible values of any table are conditioned
      * with the same marginals (row and column totals). In this case the possible values {@code x}
      * of the upper-left element {@code a} are {@code min(0, a - d) <= x <= a + min(b, c)}.
      * <ul>
      * <li>'two-sided': the odds ratio of the underlying population is not one; the p-value
      * is the probability that a random table has probability equal to or less than the input table.
      * <li>'greater': the odds ratio of the underlying population is greater than one; the p-value
      * is the probability that a random table has {@code x >= a}.
      * <li>'less': the odds ratio of the underlying population is less than one; the p-value
      * is the probability that a random table has {@code x <= a}.
      * </ul>
      *
      * @param table 2-by-2 contingency table.
      * @return test result
      * @throws IllegalArgumentException if the {@code table} is not a 2-by-2 table; any
      * table entry is negative; or the sum of the table is 0 or larger than a 32-bit signed integer.
      * @see #with(AlternativeHypothesis)
      * @see #statistic(int[][])
      */
     public SignificanceResult test(int[][] table) {
         Arguments.checkTable(table);
         final int a = table[0][0];
         final int b = table[0][1];
         final int c = table[1][0];
         final int d = table[1][1];

         // Odd-ratio.
         final double statistic = ((double) a * d) / ((double) b * c);

         final int nn = a + b + c + d;
         final int k = a + b;
         final int n = a + c;

         // Note: The distribution validates the population size is > 0
         final HypergeometricDistribution distribution = HypergeometricDistribution.of(nn, k, n);
         double p;
         if (alternative == AlternativeHypothesis.GREATER_THAN) {
             p = distribution.survivalProbability(a - 1);
         } else if (alternative == AlternativeHypothesis.LESS_THAN) {
             p = distribution.cumulativeProbability(a);
         } else {
             p = twoSidedTest(a, distribution);
         }
         return new BaseSignificanceResult(statistic, p);
     }

     /**
      * Returns the <i>observed significance level</i>, or p-value, associated with a
      * two-sided test about the observed value.
      *
      * @param k Observed value.
      * @param distribution Hypergeometric distribution.
      * @return p-value
      */
     private static double twoSidedTest(int k, HypergeometricDistribution distribution) {
         // Find all i where Pr(X = i) <= Pr(X = k) and sum them.
         // Exploit the known unimodal distribution to increase the
         // search speed. Note the search depends only on magnitude differences.
         // The current HypergeometricDistribution is faster using log probability
         // as it omits a call to Math.exp.

         // Use the mode as the point of largest probability.
         // The lower or upper mode is important for the search below.
         final int nn = distribution.getPopulationSize();
         final int kk = distribution.getNumberOfSuccesses();
         final int n = distribution.getSampleSize();
         final double v = ((double) n + 1) * ((double) kk + 1) / (nn + 2.0);
         final int m1 = (int) Math.ceil(v) - 1;
         final int m2 = (int) Math.floor(v);
         if (k < m1) {
             final double pk = distribution.logProbability(k);
             // Lower half = cdf(k)
             // Find upper half. As k < lower mode i should never
             // reach the lower mode based on the probability alone.
             // Bracket with the upper mode.
             final int i = Searches.searchDescending(m2, distribution.getSupportUpperBound(), pk,
                 distribution::logProbability);
             return distribution.cumulativeProbability(k) +
                    distribution.survivalProbability(i - 1);
         } else if (k > m2) {
             final double pk = distribution.logProbability(k);
             // Upper half = sf(k - 1)
             // Find lower half. As k > upper mode i should never
             // reach the upper mode based on the probability alone.
             // Bracket with the lower mode.
             final int i = Searches.searchAscending(distribution.getSupportLowerBound(), m1, pk,
                 distribution::logProbability);
             return distribution.cumulativeProbability(i) +
                    distribution.survivalProbability(k - 1);
         }
         // k == mode
         // Edge case where the sum of probabilities will be either
         // 1 or 1 - Pr(X = mode) where mode != k
         final double pk = distribution.probability(k);
         final double pm = distribution.probability(k == m1 ? m2 : m1);
         return pm > pk ? 1 - pm : 1;
     }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.commons.statistics.inference;

	import java.util.Objects;
	import org.apache.commons.statistics.distribution.HypergeometricDistribution;

	/**
	* Implements Fisher's exact test.
	*
	* <p>Performs an exact test for the statistical significance of the association (contingency)
	* between two kinds of categorical classification.
	*
	* <p>Fisher's test applies in the case that the row sums and column sums are fixed in advance
	* and not random.
	*
	* @see <a href="https://en.wikipedia.org/wiki/Fisher%27s_exact_test">Fisher's exact test (Wikipedia)</a>
	* @since 1.1
	*/
	public final class FisherExactTest {
	/** Default instance. */
	private static final FisherExactTest DEFAULT = new FisherExactTest(AlternativeHypothesis.TWO_SIDED);

	/** Alternative hypothesis. */
	private final AlternativeHypothesis alternative;

	/**
	* @param alternative Alternative hypothesis.
	*/
	private FisherExactTest(AlternativeHypothesis alternative) {
	this.alternative = alternative;
	}

	/**
	* Return an instance using the default options.
	*
	* <ul>
	* <li>{@link AlternativeHypothesis#TWO_SIDED}
	* </ul>
	*
	* @return default instance
	*/
	public static FisherExactTest withDefaults() {
	return DEFAULT;
	}

	/**
	* Return an instance with the configured alternative hypothesis.
	*
	* @param v Value.
	* @return an instance
	*/
	public FisherExactTest with(AlternativeHypothesis v) {
	return new FisherExactTest(Objects.requireNonNull(v));
	}

	/**
	* Compute the prior odds ratio for the 2-by-2 contingency table. This is the
	* "sample" or "unconditional" maximum likelihood estimate. For a table of:
	*
	* <p>\[ \left[ {\begin{array}{cc}
	* a & b \\
	* c & d \\
	* \end{array} } \right] \]
	*
	* <p>this is:
	*
	* <p>\[ r = \frac{a d}{b c} \]
	*
	* <p>Special cases:
	* <ul>
	* <li>If the denominator is zero, the value is {@linkplain Double#POSITIVE_INFINITY infinity}.
	* <li>If a row or column sum is zero, the value is {@link Double#NaN NaN}.
	* </ul>
	*
	* <p>Note: This statistic is equal to the statistic computed by the SciPy function
	* {@code scipy.stats.fisher_exact}. It is different to the conditional maximum
	* likelihood estimate computed by R function {@code fisher.test}.
	*
	* @param table 2-by-2 contingency table.
	* @return odds ratio
	* @throws IllegalArgumentException if the {@code table} is not a 2-by-2 table; any
	* table entry is negative; or the sum of the table is 0 or larger than a 32-bit signed integer.
	* @see #test(int[][])
	*/
	public double statistic(int[][] table) {
	Arguments.checkTable(table);
	final double a = table[0][0];
	final double b = table[0][1];
	final double c = table[1][0];
	final double d = table[1][1];
	return (a * d) / (b * c);
	}

	/**
	* Performs Fisher's exact test on the 2-by-2 contingency table.
	*
	* <p>The test statistic is equal to the prior odds ratio. This is the
	* "sample" or "unconditional" maximum likelihood estimate.
	*
	* <p>The test is defined by the {@link AlternativeHypothesis}.
	*
	* <p>For a table of [[a, b], [c, d]] the possible values of any table are conditioned
	* with the same marginals (row and column totals). In this case the possible values {@code x}
	* of the upper-left element {@code a} are {@code min(0, a - d) <= x <= a + min(b, c)}.
	* <ul>
	* <li>'two-sided': the odds ratio of the underlying population is not one; the p-value
	* is the probability that a random table has probability equal to or less than the input table.
	* <li>'greater': the odds ratio of the underlying population is greater than one; the p-value
	* is the probability that a random table has {@code x >= a}.
	* <li>'less': the odds ratio of the underlying population is less than one; the p-value
	* is the probability that a random table has {@code x <= a}.
	* </ul>
	*
	* @param table 2-by-2 contingency table.
	* @return test result
	* @throws IllegalArgumentException if the {@code table} is not a 2-by-2 table; any
	* table entry is negative; or the sum of the table is 0 or larger than a 32-bit signed integer.
	* @see #with(AlternativeHypothesis)
	* @see #statistic(int[][])
	*/
	public SignificanceResult test(int[][] table) {
	Arguments.checkTable(table);
	final int a = table[0][0];
	final int b = table[0][1];
	final int c = table[1][0];
	final int d = table[1][1];

	// Odd-ratio.
	final double statistic = ((double) a * d) / ((double) b * c);

	final int nn = a + b + c + d;
	final int k = a + b;
	final int n = a + c;

	// Note: The distribution validates the population size is > 0
	final HypergeometricDistribution distribution = HypergeometricDistribution.of(nn, k, n);
	double p;
	if (alternative == AlternativeHypothesis.GREATER_THAN) {
	p = distribution.survivalProbability(a - 1);
	} else if (alternative == AlternativeHypothesis.LESS_THAN) {
	p = distribution.cumulativeProbability(a);
	} else {
	p = twoSidedTest(a, distribution);
	}
	return new BaseSignificanceResult(statistic, p);
	}

	/**
	* Returns the <i>observed significance level</i>, or p-value, associated with a
	* two-sided test about the observed value.
	*
	* @param k Observed value.
	* @param distribution Hypergeometric distribution.
	* @return p-value
	*/
	private static double twoSidedTest(int k, HypergeometricDistribution distribution) {
	// Find all i where Pr(X = i) <= Pr(X = k) and sum them.
	// Exploit the known unimodal distribution to increase the
	// search speed. Note the search depends only on magnitude differences.
	// The current HypergeometricDistribution is faster using log probability
	// as it omits a call to Math.exp.

	// Use the mode as the point of largest probability.
	// The lower or upper mode is important for the search below.
	final int nn = distribution.getPopulationSize();
	final int kk = distribution.getNumberOfSuccesses();
	final int n = distribution.getSampleSize();
	final double v = ((double) n + 1) * ((double) kk + 1) / (nn + 2.0);
	final int m1 = (int) Math.ceil(v) - 1;
	final int m2 = (int) Math.floor(v);
	if (k < m1) {
	final double pk = distribution.logProbability(k);
	// Lower half = cdf(k)
	// Find upper half. As k < lower mode i should never
	// reach the lower mode based on the probability alone.
	// Bracket with the upper mode.
	final int i = Searches.searchDescending(m2, distribution.getSupportUpperBound(), pk,
	distribution::logProbability);
	return distribution.cumulativeProbability(k) +
	distribution.survivalProbability(i - 1);
	} else if (k > m2) {
	final double pk = distribution.logProbability(k);
	// Upper half = sf(k - 1)
	// Find lower half. As k > upper mode i should never
	// reach the upper mode based on the probability alone.
	// Bracket with the lower mode.
	final int i = Searches.searchAscending(distribution.getSupportLowerBound(), m1, pk,
	distribution::logProbability);
	return distribution.cumulativeProbability(i) +
	distribution.survivalProbability(k - 1);
	}
	// k == mode
	// Edge case where the sum of probabilities will be either
	// 1 or 1 - Pr(X = mode) where mode != k
	final double pk = distribution.probability(k);
	final double pm = distribution.probability(k == m1 ? m2 : m1);
	return pm > pk ? 1 - pm : 1;
	}
	}