commons-statistics-inference/src/main/java/org/apache/commons/statistics/inference/GTest.java - commons-statistics - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.commons.statistics.inference;

 import org.apache.commons.numbers.core.Sum;
 import org.apache.commons.statistics.distribution.ChiSquaredDistribution;

 /**
  * Implements G-test (Generalized Log-Likelihood Ratio Test) statistics.
  *
  * <p>This is known in statistical genetics as the McDonald-Kreitman test.
  * The implementation handles both known and unknown distributions.
  *
  * <p>Two samples tests can be used when the distribution is unknown <i>a priori</i>
  * but provided by one sample, or when the hypothesis under test is that the two
  * samples come from the same underlying distribution.
  *
  * @see <a href="https://en.wikipedia.org/wiki/G-test">G-test (Wikipedia)</a>
  * @since 1.1
  */
 public final class GTest {
     // Note:
     // The g-test statistic is a summation of terms with positive and negative sign
     // and thus the sum may exhibit cancellation. This class uses separate high precision
     // sums of the positive and negative terms which are then combined.
     // Total cancellation for a large number of terms will not impact
     // p-values of interest around critical alpha values as the Chi^2
     // distribution exhibits strong concentration around its mean (degrees of freedom, k).
     // The summation only need maintain enough bits in the final sum to distinguish
     // g values around critical alpha values where 0 << chisq.sf(g, k) << 0.5: g > k,
     // with k = number of terms - 1.

     /** Default instance. */
     private static final GTest DEFAULT = new GTest(0);

     /** Degrees of freedom adjustment. */
     private final int degreesOfFreedomAdjustment;

     /**
      * @param degreesOfFreedomAdjustment Degrees of freedom adjustment.
      */
     private GTest(int degreesOfFreedomAdjustment) {
         this.degreesOfFreedomAdjustment = degreesOfFreedomAdjustment;
     }

     /**
      * Return an instance using the default options.
      *
      * <ul>
      * <li>{@linkplain #withDegreesOfFreedomAdjustment(int) Degrees of freedom adjustment = 0}
      * </ul>
      *
      * @return default instance
      */
     public static GTest withDefaults() {
         return DEFAULT;
     }

     /**
      * Return an instance with the configured degrees of freedom adjustment.
      *
      * <p>The default degrees of freedom for a sample of length {@code n} are
      * {@code n - 1}. An intrinsic null hypothesis is one where you estimate one or
      * more parameters from the data in order to get the numbers for your null
      * hypothesis. For a distribution with {@code p} parameters where up to
      * {@code p} parameters have been estimated from the data the degrees of freedom
      * is in the range {@code [n - 1 - p, n - 1]}.
      *
      * @param v Value.
      * @return an instance
      * @throws IllegalArgumentException if the value is negative
      */
     public GTest withDegreesOfFreedomAdjustment(int v) {
         return new GTest(Arguments.checkNonNegative(v));
     }

     /**
      * Computes the G-test goodness-of-fit statistic comparing the {@code observed} counts to
      * a uniform expected value (each category is equally likely).
      *
      * <p>Note: This is a specialized version of a comparison of {@code observed}
      * with an {@code expected} array of uniform values. The result is faster than
      * calling {@link #statistic(double[], long[])} and the statistic is the same,
      * with an allowance for accumulated floating-point error due to the optimized
      * routine.
      *
      * @param observed Observed frequency counts.
      * @return G-test statistic
      * @throws IllegalArgumentException if the sample size is less than 2;
      * {@code observed} has negative entries; or all the observations are zero.
      * @see #test(long[])
      */
     public double statistic(long[] observed) {
         Arguments.checkValuesRequiredSize(observed.length, 2);
         Arguments.checkNonNegative(observed);
         final double e = StatisticUtils.mean(observed);
         if (e == 0) {
             throw new InferenceException(InferenceException.NO_DATA);
         }
         // g = 2 * sum{o * ln(o/e)}
         //   = 2 * [ sum{o * ln(o)} - sum(o) * ln(e) ]
         // The second form has more cancellation as the sums are larger.
         // Separate sum for positive and negative terms.
         final Sum sum = Sum.create();
         final Sum sum2 = Sum.create();
         for (final double o : observed) {
             if (o > e) {
                 // Positive term
                 sum.add(o * Math.log(o / e));
             } else if (o > 0) {
                 // Negative term
                 // Process non-zero counts to avoid 0 * -inf = NaN
                 sum2.add(o * Math.log(o / e));
             }
         }
         return sum.add(sum2).getAsDouble() * 2;
     }

     /**
      * Computes the G-test goodness-of-fit statistic comparing {@code observed} and {@code expected}
      * frequency counts.
      *
      * <p><strong>Note:</strong>This implementation rescales the values
      * if necessary to ensure that the sum of the expected and observed counts
      * are equal.
      *
      * @param expected Expected frequency counts.
      * @param observed Observed frequency counts.
      * @return G-test statistic
      * @throws IllegalArgumentException if the sample size is less than 2; the array
      * sizes do not match; {@code expected} has entries that are not strictly
      * positive; {@code observed} has negative entries; or all the observations are zero.
      * @see #test(double[], long[])
      */
     public double statistic(double[] expected, long[] observed) {
         // g = 2 * sum{o * ln(o/e)}
         // The sum of o and e must be the same.
         final double ratio = StatisticUtils.computeRatio(expected, observed);
         // High precision sum to reduce cancellation.
         // Separate sum for positive and negative terms.
         final Sum sum = Sum.create();
         final Sum sum2 = Sum.create();
         for (int i = 0; i < observed.length; i++) {
             final long o = observed[i];
             // Process non-zero counts to avoid 0 * -inf = NaN
             if (o != 0) {
                 final double term = o * Math.log(o / (ratio * expected[i]));
                 if (term < 0) {
                     sum2.add(term);
                 } else {
                     sum.add(term);
                 }
             }
         }
         return sum.add(sum2).getAsDouble() * 2;
     }

     /**
      * Computes a G-test statistic associated with a G-test of
      * independence based on the input {@code counts} array, viewed as a two-way
      * table. The formula used to compute the test statistic is:
      *
      * <p>\[ G = 2 \cdot \sum_{ij}{O_{ij}} \cdot \left[ H(r) + H(c) - H(r,c) \right] \]
      *
      * <p>and \( H \) is the <a
      * href="https://en.wikipedia.org/wiki/Entropy_%28information_theory%29">
      * Shannon Entropy</a> of the random variable formed by viewing the elements of
      * the argument array as incidence counts:
      *
      * <p>\[ H(X) = - {\sum_{x \in \text{Supp}(X)} p(x) \ln p(x)} \]
      *
      * @param counts 2-way table.
      * @return G-test statistic
      * @throws IllegalArgumentException if the number of rows or columns is less
      * than 2; the array is non-rectangular; the array has negative entries; or the
      * sum of a row or column is zero.
      * @see ChiSquareTest#test(long[][])
      */
     public double statistic(long[][] counts) {
         Arguments.checkCategoriesRequiredSize(counts.length, 2);
         Arguments.checkValuesRequiredSize(counts[0].length, 2);
         Arguments.checkRectangular(counts);
         Arguments.checkNonNegative(counts);

         final int ni = counts.length;
         final int nj = counts[0].length;

         // Compute row, column and total sums
         final double[] sumi = new double[ni];
         final double[] sumj = new double[nj];
         double n = 0;
         // We can sum data on the first pass. See below for computation details.
         final Sum sum = Sum.create();
         for (int i = 0; i < ni; i++) {
             for (int j = 0; j < nj; j++) {
                 final long c = counts[i][j];
                 sumi[i] += c;
                 sumj[j] += c;
                 if (c > 1) {
                     sum.add(c * Math.log(c));
                 }
             }
             checkNonZero(sumi[i], "Row", i);
             n += sumi[i];
         }

         for (int j = 0; j < nj; j++) {
             checkNonZero(sumj[j], "Column", j);
         }

         // This computes a modified form of the Shannon entropy H without requiring
         // normalisation of observations to probabilities and without negation,
         // i.e. we compute n * [ H(r) + H(c) - H(r,c) ] as [ H'(r,c) - H'(r) - H'(c) ].

         // H  = -sum (p * log(p))
         // H' = n * sum (p * log(p))
         //    = n * sum (o/n * log(o/n))
         //    = n * [ sum(o/n * log(o)) - sum(o/n * log(n)) ]
         //    = sum(o * log(o)) - n log(n)

         // After 3 modified entropy sums H'(r,c) - H'(r) - H'(c) compensation is (-1 + 2) * n log(n)
         sum.addProduct(n, Math.log(n));
         // Negative terms
         final Sum sum2 = Sum.create();
         // All these counts are above zero so no check for zeros
         for (final double c : sumi) {
             sum2.add(c * -Math.log(c));
         }
         for (final double c : sumj) {
             sum2.add(c * -Math.log(c));
         }

         return sum.add(sum2).getAsDouble() * 2;
     }

     /**
      * Perform a G-test for goodness-of-fit evaluating the null hypothesis that the {@code observed}
      * counts conform to a uniform distribution (each category is equally likely).
      *
      * @param observed Observed frequency counts.
      * @return test result
      * @throws IllegalArgumentException if the sample size is less than 2;
      * {@code observed} has negative entries; or all the observations are zero
      * @see #statistic(long[])
      */
     public SignificanceResult test(long[] observed) {
         final int df = observed.length - 1;
         final double g = statistic(observed);
         final double p = computeP(g, df);
         return new BaseSignificanceResult(g, p);
     }

     /**
      * Perform a G-test for goodness-of-fit evaluating the null hypothesis that the {@code observed}
      * counts conform to the {@code expected} counts.
      *
      * <p>The test can be configured to apply an adjustment to the degrees of freedom
      * if the observed data has been used to create the expected counts.
      *
      * @param expected Expected frequency counts.
      * @param observed Observed frequency counts.
      * @return test result
      * @throws IllegalArgumentException if the sample size is less than 2; the array
      * sizes do not match; {@code expected} has entries that are not strictly
      * positive; {@code observed} has negative entries; all the observations are zero; or
      * the adjusted degrees of freedom are not strictly positive
      * @see #withDegreesOfFreedomAdjustment(int)
      * @see #statistic(double[], long[])
      */
     public SignificanceResult test(double[] expected, long[] observed) {
         final int df = StatisticUtils.computeDegreesOfFreedom(observed.length, degreesOfFreedomAdjustment);
         final double g = statistic(expected, observed);
         final double p = computeP(g, df);
         return new BaseSignificanceResult(g, p);
     }

     /**
      * Perform a G-test of independence based on the input
      * {@code counts} array, viewed as a two-way table.
      *
      * @param counts 2-way table.
      * @return test result
      * @throws IllegalArgumentException if the number of rows or columns is less
      * than 2; the array is non-rectangular; the array has negative entries; or the
      * sum of a row or column is zero.
      * @see #statistic(long[][])
      */
     public SignificanceResult test(long[][] counts) {
         final double g = statistic(counts);
         final double df = (counts.length - 1.0) * (counts[0].length - 1.0);
         final double p = computeP(g, df);
         return new BaseSignificanceResult(g, p);
     }

     /**
      * Compute the G-test p-value.
      *
      * @param g G-test statistic.
      * @param degreesOfFreedom Degrees of freedom.
      * @return p-value
      */
     private static double computeP(double g, double degreesOfFreedom) {
         return ChiSquaredDistribution.of(degreesOfFreedom).survivalProbability(g);
     }

     /**
      * Check the array value is non-zero.
      *
      * @param value Value
      * @param name Name of the array
      * @param index Index in the array
      * @throws IllegalArgumentException if the value is zero
      */
     private static void checkNonZero(double value, String name, int index) {
         if (value == 0) {
             throw new InferenceException(InferenceException.ZERO_AT, name, index);
         }
     }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.commons.statistics.inference;

	import org.apache.commons.numbers.core.Sum;
	import org.apache.commons.statistics.distribution.ChiSquaredDistribution;

	/**
	* Implements G-test (Generalized Log-Likelihood Ratio Test) statistics.
	*
	* <p>This is known in statistical genetics as the McDonald-Kreitman test.
	* The implementation handles both known and unknown distributions.
	*
	* <p>Two samples tests can be used when the distribution is unknown <i>a priori</i>
	* but provided by one sample, or when the hypothesis under test is that the two
	* samples come from the same underlying distribution.
	*
	* @see <a href="https://en.wikipedia.org/wiki/G-test">G-test (Wikipedia)</a>
	* @since 1.1
	*/
	public final class GTest {
	// Note:
	// The g-test statistic is a summation of terms with positive and negative sign
	// and thus the sum may exhibit cancellation. This class uses separate high precision
	// sums of the positive and negative terms which are then combined.
	// Total cancellation for a large number of terms will not impact
	// p-values of interest around critical alpha values as the Chi^2
	// distribution exhibits strong concentration around its mean (degrees of freedom, k).
	// The summation only need maintain enough bits in the final sum to distinguish
	// g values around critical alpha values where 0 << chisq.sf(g, k) << 0.5: g > k,
	// with k = number of terms - 1.

	/** Default instance. */
	private static final GTest DEFAULT = new GTest(0);

	/** Degrees of freedom adjustment. */
	private final int degreesOfFreedomAdjustment;

	/**
	* @param degreesOfFreedomAdjustment Degrees of freedom adjustment.
	*/
	private GTest(int degreesOfFreedomAdjustment) {
	this.degreesOfFreedomAdjustment = degreesOfFreedomAdjustment;
	}

	/**
	* Return an instance using the default options.
	*
	* <ul>
	* <li>{@linkplain #withDegreesOfFreedomAdjustment(int) Degrees of freedom adjustment = 0}
	* </ul>
	*
	* @return default instance
	*/
	public static GTest withDefaults() {
	return DEFAULT;
	}

	/**
	* Return an instance with the configured degrees of freedom adjustment.
	*
	* <p>The default degrees of freedom for a sample of length {@code n} are
	* {@code n - 1}. An intrinsic null hypothesis is one where you estimate one or
	* more parameters from the data in order to get the numbers for your null
	* hypothesis. For a distribution with {@code p} parameters where up to
	* {@code p} parameters have been estimated from the data the degrees of freedom
	* is in the range {@code [n - 1 - p, n - 1]}.
	*
	* @param v Value.
	* @return an instance
	* @throws IllegalArgumentException if the value is negative
	*/
	public GTest withDegreesOfFreedomAdjustment(int v) {
	return new GTest(Arguments.checkNonNegative(v));
	}

	/**
	* Computes the G-test goodness-of-fit statistic comparing the {@code observed} counts to
	* a uniform expected value (each category is equally likely).
	*
	* <p>Note: This is a specialized version of a comparison of {@code observed}
	* with an {@code expected} array of uniform values. The result is faster than
	* calling {@link #statistic(double[], long[])} and the statistic is the same,
	* with an allowance for accumulated floating-point error due to the optimized
	* routine.
	*
	* @param observed Observed frequency counts.
	* @return G-test statistic
	* @throws IllegalArgumentException if the sample size is less than 2;
	* {@code observed} has negative entries; or all the observations are zero.
	* @see #test(long[])
	*/
	public double statistic(long[] observed) {
	Arguments.checkValuesRequiredSize(observed.length, 2);
	Arguments.checkNonNegative(observed);
	final double e = StatisticUtils.mean(observed);
	if (e == 0) {
	throw new InferenceException(InferenceException.NO_DATA);
	}
	// g = 2 * sum{o * ln(o/e)}
	// = 2 * [ sum{o * ln(o)} - sum(o) * ln(e) ]
	// The second form has more cancellation as the sums are larger.
	// Separate sum for positive and negative terms.
	final Sum sum = Sum.create();
	final Sum sum2 = Sum.create();
	for (final double o : observed) {
	if (o > e) {
	// Positive term
	sum.add(o * Math.log(o / e));
	} else if (o > 0) {
	// Negative term
	// Process non-zero counts to avoid 0 * -inf = NaN
	sum2.add(o * Math.log(o / e));
	}
	}
	return sum.add(sum2).getAsDouble() * 2;
	}

	/**
	* Computes the G-test goodness-of-fit statistic comparing {@code observed} and {@code expected}
	* frequency counts.
	*
	* <p><strong>Note:</strong>This implementation rescales the values
	* if necessary to ensure that the sum of the expected and observed counts
	* are equal.
	*
	* @param expected Expected frequency counts.
	* @param observed Observed frequency counts.
	* @return G-test statistic
	* @throws IllegalArgumentException if the sample size is less than 2; the array
	* sizes do not match; {@code expected} has entries that are not strictly
	* positive; {@code observed} has negative entries; or all the observations are zero.
	* @see #test(double[], long[])
	*/
	public double statistic(double[] expected, long[] observed) {
	// g = 2 * sum{o * ln(o/e)}
	// The sum of o and e must be the same.
	final double ratio = StatisticUtils.computeRatio(expected, observed);
	// High precision sum to reduce cancellation.
	// Separate sum for positive and negative terms.
	final Sum sum = Sum.create();
	final Sum sum2 = Sum.create();
	for (int i = 0; i < observed.length; i++) {
	final long o = observed[i];
	// Process non-zero counts to avoid 0 * -inf = NaN
	if (o != 0) {
	final double term = o * Math.log(o / (ratio * expected[i]));
	if (term < 0) {
	sum2.add(term);
	} else {
	sum.add(term);
	}
	}
	}
	return sum.add(sum2).getAsDouble() * 2;
	}

	/**
	* Computes a G-test statistic associated with a G-test of
	* independence based on the input {@code counts} array, viewed as a two-way
	* table. The formula used to compute the test statistic is:
	*
	* <p>\[ G = 2 \cdot \sum_{ij}{O_{ij}} \cdot \left[ H(r) + H(c) - H(r,c) \right] \]
	*
	* <p>and \( H \) is the <a
	* href="https://en.wikipedia.org/wiki/Entropy_%28information_theory%29">
	* Shannon Entropy</a> of the random variable formed by viewing the elements of
	* the argument array as incidence counts:
	*
	* <p>\[ H(X) = - {\sum_{x \in \text{Supp}(X)} p(x) \ln p(x)} \]
	*
	* @param counts 2-way table.
	* @return G-test statistic
	* @throws IllegalArgumentException if the number of rows or columns is less
	* than 2; the array is non-rectangular; the array has negative entries; or the
	* sum of a row or column is zero.
	* @see ChiSquareTest#test(long[][])
	*/
	public double statistic(long[][] counts) {
	Arguments.checkCategoriesRequiredSize(counts.length, 2);
	Arguments.checkValuesRequiredSize(counts[0].length, 2);
	Arguments.checkRectangular(counts);
	Arguments.checkNonNegative(counts);

	final int ni = counts.length;
	final int nj = counts[0].length;

	// Compute row, column and total sums
	final double[] sumi = new double[ni];
	final double[] sumj = new double[nj];
	double n = 0;
	// We can sum data on the first pass. See below for computation details.
	final Sum sum = Sum.create();
	for (int i = 0; i < ni; i++) {
	for (int j = 0; j < nj; j++) {
	final long c = counts[i][j];
	sumi[i] += c;
	sumj[j] += c;
	if (c > 1) {
	sum.add(c * Math.log(c));
	}
	}
	checkNonZero(sumi[i], "Row", i);
	n += sumi[i];
	}

	for (int j = 0; j < nj; j++) {
	checkNonZero(sumj[j], "Column", j);
	}

	// This computes a modified form of the Shannon entropy H without requiring
	// normalisation of observations to probabilities and without negation,
	// i.e. we compute n * [ H(r) + H(c) - H(r,c) ] as [ H'(r,c) - H'(r) - H'(c) ].

	// H = -sum (p * log(p))
	// H' = n * sum (p * log(p))
	// = n * sum (o/n * log(o/n))
	// = n * [ sum(o/n * log(o)) - sum(o/n * log(n)) ]
	// = sum(o * log(o)) - n log(n)

	// After 3 modified entropy sums H'(r,c) - H'(r) - H'(c) compensation is (-1 + 2) * n log(n)
	sum.addProduct(n, Math.log(n));
	// Negative terms
	final Sum sum2 = Sum.create();
	// All these counts are above zero so no check for zeros
	for (final double c : sumi) {
	sum2.add(c * -Math.log(c));
	}
	for (final double c : sumj) {
	sum2.add(c * -Math.log(c));
	}

	return sum.add(sum2).getAsDouble() * 2;
	}

	/**
	* Perform a G-test for goodness-of-fit evaluating the null hypothesis that the {@code observed}
	* counts conform to a uniform distribution (each category is equally likely).
	*
	* @param observed Observed frequency counts.
	* @return test result
	* @throws IllegalArgumentException if the sample size is less than 2;
	* {@code observed} has negative entries; or all the observations are zero
	* @see #statistic(long[])
	*/
	public SignificanceResult test(long[] observed) {
	final int df = observed.length - 1;
	final double g = statistic(observed);
	final double p = computeP(g, df);
	return new BaseSignificanceResult(g, p);
	}

	/**
	* Perform a G-test for goodness-of-fit evaluating the null hypothesis that the {@code observed}
	* counts conform to the {@code expected} counts.
	*
	* <p>The test can be configured to apply an adjustment to the degrees of freedom
	* if the observed data has been used to create the expected counts.
	*
	* @param expected Expected frequency counts.
	* @param observed Observed frequency counts.
	* @return test result
	* @throws IllegalArgumentException if the sample size is less than 2; the array
	* sizes do not match; {@code expected} has entries that are not strictly
	* positive; {@code observed} has negative entries; all the observations are zero; or
	* the adjusted degrees of freedom are not strictly positive
	* @see #withDegreesOfFreedomAdjustment(int)
	* @see #statistic(double[], long[])
	*/
	public SignificanceResult test(double[] expected, long[] observed) {
	final int df = StatisticUtils.computeDegreesOfFreedom(observed.length, degreesOfFreedomAdjustment);
	final double g = statistic(expected, observed);
	final double p = computeP(g, df);
	return new BaseSignificanceResult(g, p);
	}

	/**
	* Perform a G-test of independence based on the input
	* {@code counts} array, viewed as a two-way table.
	*
	* @param counts 2-way table.
	* @return test result
	* @throws IllegalArgumentException if the number of rows or columns is less
	* than 2; the array is non-rectangular; the array has negative entries; or the
	* sum of a row or column is zero.
	* @see #statistic(long[][])
	*/
	public SignificanceResult test(long[][] counts) {
	final double g = statistic(counts);
	final double df = (counts.length - 1.0) * (counts[0].length - 1.0);
	final double p = computeP(g, df);
	return new BaseSignificanceResult(g, p);
	}

	/**
	* Compute the G-test p-value.
	*
	* @param g G-test statistic.
	* @param degreesOfFreedom Degrees of freedom.
	* @return p-value
	*/
	private static double computeP(double g, double degreesOfFreedom) {
	return ChiSquaredDistribution.of(degreesOfFreedom).survivalProbability(g);
	}

	/**
	* Check the array value is non-zero.
	*
	* @param value Value
	* @param name Name of the array
	* @param index Index in the array
	* @throws IllegalArgumentException if the value is zero
	*/
	private static void checkNonZero(double value, String name, int index) {
	if (value == 0) {
	throw new InferenceException(InferenceException.ZERO_AT, name, index);
	}
	}
	}