commons-statistics-inference/src/main/java/org/apache/commons/statistics/inference/StatisticUtils.java - commons-statistics - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.commons.statistics.inference;

 import java.util.Arrays;
 import java.util.Collection;
 import java.util.stream.IntStream;
 import org.apache.commons.numbers.core.Precision;

 /**
  * Utility computation methods.
  *
  * @since 1.1
  */
 final class StatisticUtils {
     /** No instances. */
     private StatisticUtils() {}

     /**
      * Compute {@code x - y}.
      *
      * <p>If {@code y} is zero the original array is returned, else a new array is created
      * with the difference.
      *
      * @param x Array.
      * @param y Value.
      * @return x - y
      * @throws NullPointerException if {@code x} is null and {@code y} is non-zero
      */
     static double[] subtract(double[] x, double y) {
         return y == 0 ? x : Arrays.stream(x).map(v -> v - y).toArray();
     }

     /**
      * Compute the degrees of freedom as {@code n - 1 - m}.
      *
      * <p>This method is common functionality shared between the Chi-square test and
      * G-test. The pre-conditions for those tests are performed by this method.
      *
      * @param n Number of observations.
      * @param m Adjustment (assumed to be positive).
      * @return the degrees of freedom
      * @throws IllegalArgumentException if the degrees of freedom is not strictly positive
      */
     static int computeDegreesOfFreedom(int n, int m) {
         final int df = n - 1 - m;
         if (df <= 0) {
             throw new InferenceException("Invalid degrees of freedom: " + df);
         }
         return df;
     }

     /**
      * Gets the ratio between the sum of the observed and expected values.
      * The ratio can be used to scale the expected values to have the same sum
      * as the observed values:
      *
      * <pre>
      * sum(o) = sum(e * ratio)
      * </pre>
      *
      * <p>This method is common functionality shared between the Chi-square test and
      * G-test. The pre-conditions for those tests are performed by this method.
      *
      * @param expected Expected values.
      * @param observed Observed values.
      * @return the ratio
      * @throws IllegalArgumentException if the sample size is less than 2; the array
      * sizes do not match; {@code expected} has entries that are not strictly
      * positive; {@code observed} has negative entries; or all the observations are zero.
      */
     static double computeRatio(double[] expected, long[] observed) {
         Arguments.checkValuesRequiredSize(expected.length, 2);
         Arguments.checkValuesSizeMatch(expected.length, observed.length);
         Arguments.checkStrictlyPositive(expected);
         Arguments.checkNonNegative(observed);
         final DD e = DD.create();
         final DD o = DD.create();
         for (int i = 0; i < observed.length; i++) {
             DD.fastAdd(e.hi(), e.lo(), expected[i], e);
             add(o, observed[i]);
         }
         if (o.doubleValue() == 0) {
             throw new InferenceException(InferenceException.NO_DATA);
         }
         // sum(o) / sum(e)
         final double ratio = DD.divide(o.hi(), o.lo(), e.hi(), e.lo(), e).doubleValue();
         // Allow a sum within 1 ulp of 1.0
         return Precision.equals(ratio, 1.0, 0) ? 1.0 : ratio;
     }

     /**
      * Adds the value to the sum.
      *
      * @param sum Sum.
      * @param v Value.
      */
     private static void add(DD sum, long v) {
         // Split into hi and lo parts so the high part has 53-bits
         final double hi = v;
         final long lo = v - (long) hi;
         // The condition here is a high probability branch if the sample is
         // frequency counts which are typically in the 32-bit integer range.
         if (lo == 0) {
             DD.fastAdd(sum.hi(), sum.lo(), hi, sum);
         } else {
             DD.fastAdd(sum.hi(), sum.lo(), hi, lo, sum);
         }
     }

     /**
      * Returns the arithmetic mean of the entries in the input array,
      * or {@code NaN} if the array is empty.
      *
      * @param x Values.
      * @return the mean of the values or NaN if length = 0
      */
     static double mean(long[] x) {
         final int n = x.length;
         if (n == 0) {
             return Double.NaN;
         }

         // Single pass high accuracy sum. The total cannot be more than 2^63 * 2^31 bits
         // so can be exactly represented in a double-double. Cumulative error in the sum
         // is (n-1) * 4eps with eps = 2^-106. The sum should be exact to double precision.
         final DD dd = DD.create();
         for (final long v : x) {
             add(dd, v);
         }

         return DD.divide(dd.hi(), dd.lo(), n, 0, dd).doubleValue();
     }

     /**
      * Returns the arithmetic mean of the entries in the input array,
      * or {@code NaN} if the array is empty.
      *
      * <p>A two-pass, corrected algorithm is used, starting with the definitional formula
      * computed using the array of stored values and then correcting this by adding the
      * mean deviation of the data values from the arithmetic mean. See, e.g. "Comparison
      * of Several Algorithms for Computing Sample Means and Variances," Robert F. Ling,
      * Journal of the American Statistical Association, Vol. 69, No. 348 (Dec., 1974), pp.
      * 859-866.
      *
      * @param x Values.
      * @return the mean of the values or NaN if length = 0
      */
     static double mean(double[] x) {
         final int n = x.length;
         // No check for n == 0 -> return NaN.
         // This internal method is only called with non-zero length arrays.
         // The divide by zero creates NaN anyway.

         // Adapted from org.apache.commons.math4.legacy.stat.descriptive.moment.Mean
         // Updated to use a stream to support high-precision summation as the stream maintains
         // a rounding-error term during the aggregation. This is important
         // when summing differences which can create cancellation: x + -x => 0.

         // Compute initial estimate using definitional formula
         final double mean = Arrays.stream(x).sum() / n;

         // Compute correction factor in second pass
         return mean + Arrays.stream(x).map(v -> v - mean).sum() / n;
     }

     /**
      * Returns the arithmetic mean of the entries in the input arrays,
      * or {@code NaN} if the combined length of the arrays is zero.
      *
      * <p>This is the equivalent of using {@link #mean(double[])} with all the samples
      * concatenated into a single array. Supports a combined length above the maximum array
      * size.
      *
      * @param samples Values.
      * @return the mean of the values or NaN if length = 0
      * @see #mean(double[])
      */
     static double mean(Collection<double[]> samples) {
         // See above for computation details
         final long n = samples.stream().mapToInt(x -> x.length).sum();
         final double mean = samples.stream().flatMapToDouble(Arrays::stream).sum() / n;
         return mean + samples.stream().flatMapToDouble(Arrays::stream).map(v -> v - mean).sum() / n;
     }

     /**
      * Returns the variance of the entries in the input array, or {@code NaN} if the array
      * is empty.
      *
      * <p>This method returns the bias-corrected sample variance (using {@code n - 1} in
      * the denominator).
      *
      * <p>Uses a two-pass algorithm. Specifically, these methods use the "corrected
      * two-pass algorithm" from Chan, Golub, Levesque, <i>Algorithms for Computing the
      * Sample Variance</i>, American Statistician, vol. 37, no. 3 (1983) pp.
      * 242-247.
      *
      * <p>Returns 0 for a single-value (i.e. length = 1) sample.
      *
      * @param x Values.
      * @param mean the mean of the input array
      * @return the variance of the values or NaN if the array is empty
      */
     static double variance(double[] x, double mean) {
         final int n = x.length;
         // No check for n == 0 -> return NaN.
         // This internal method is only called with non-zero length arrays.
         // The input mean of NaN for zero length creates NaN anyway.
         if (n == 1) {
             return 0;
         }

         // Adapted from org.apache.commons.math4.legacy.stat.descriptive.moment.Variance
         // Use a stream to accumulate the sum of deviations in high precision.
         // This compensation term for the sum of deviations from the mean -> 0.
         // We sum the squares in standard precision as there is no cancellation of summands.
         final double[] sumSq = {0};
         final double sum2 = Arrays.stream(x).map(v -> {
             final double dx = v - mean;
             sumSq[0] += dx * dx;
             return dx;
         }).sum();

         final double sum1 = sumSq[0];
         // Bias corrected
         // Note: variance ~ sum1 / (n-1) but with a correction term sum2
         return (sum1 - (sum2 * sum2 / n)) / (n - 1);
     }

     /**
      * Returns the mean of the (signed) differences between corresponding elements of the
      * input arrays.
      *
      * <pre>
      * sum(x[i] - y[i]) / x.length
      * </pre>
      *
      * <p>This computes the same result as creating an array {@code z = x - y}
      * and calling {@link #mean(double[]) mean(z)}, but without the intermediate array
      * allocation.
      *
      * @param x First array.
      * @param y Second array.
      * @return mean of paired differences
      * @throws IllegalArgumentException if the arrays do not have the same length.
      * @see #mean(double[])
      */
     static double meanDifference(double[] x, double[] y) {
         final int n = x.length;
         if (n != y.length) {
             throw new InferenceException(InferenceException.VALUES_MISMATCH, n, y.length);
         }
         // See mean(double[]) for details.
         final double mean = IntStream.range(0, n).mapToDouble(i -> x[i] - y[i]).sum() / n;
         return mean + IntStream.range(0, n).mapToDouble(i -> (x[i] - y[i]) - mean).sum() / n;
     }

     /**
      * Returns the variance of the (signed) differences between corresponding elements of
      * the input arrays.
      *
      * <pre>
      * var(x[i] - y[i])
      * </pre>
      *
      * <p>This computes the same result as creating an array {@code z = x - y}
      * and calling {@link #variance(double[], double) variance(z, mean(z))}, but without the
      * intermediate array allocation.
      *
      * @param x First array.
      * @param y Second array.
      * @param mean the mean difference between corresponding entries
      * @return variance of paired differences
      * @throws IllegalArgumentException if the arrays do not have the same length.
      * @see #meanDifference(double[], double[])
      * @see #variance(double[], double)
      */
     static double varianceDifference(double[] x, double[] y, double mean) {
         final int n = x.length;
         if (n != y.length) {
             throw new InferenceException(InferenceException.VALUES_MISMATCH, n, y.length);
         }
         // See variance(double[]) for details.
         if (n == 1) {
             return 0;
         }
         final double[] sumSq = {0};
         final double sum2 = IntStream.range(0, n).mapToDouble(i -> {
             final double dx = (x[i] - y[i]) - mean;
             sumSq[0] += dx * dx;
             return dx;
         }).sum();
         final double sum1 = sumSq[0];
         return (sum1 - (sum2 * sum2 / n)) / (n - 1);
     }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.commons.statistics.inference;

	import java.util.Arrays;
	import java.util.Collection;
	import java.util.stream.IntStream;
	import org.apache.commons.numbers.core.Precision;

	/**
	* Utility computation methods.
	*
	* @since 1.1
	*/
	final class StatisticUtils {
	/** No instances. */
	private StatisticUtils() {}

	/**
	* Compute {@code x - y}.
	*
	* <p>If {@code y} is zero the original array is returned, else a new array is created
	* with the difference.
	*
	* @param x Array.
	* @param y Value.
	* @return x - y
	* @throws NullPointerException if {@code x} is null and {@code y} is non-zero
	*/
	static double[] subtract(double[] x, double y) {
	return y == 0 ? x : Arrays.stream(x).map(v -> v - y).toArray();
	}

	/**
	* Compute the degrees of freedom as {@code n - 1 - m}.
	*
	* <p>This method is common functionality shared between the Chi-square test and
	* G-test. The pre-conditions for those tests are performed by this method.
	*
	* @param n Number of observations.
	* @param m Adjustment (assumed to be positive).
	* @return the degrees of freedom
	* @throws IllegalArgumentException if the degrees of freedom is not strictly positive
	*/
	static int computeDegreesOfFreedom(int n, int m) {
	final int df = n - 1 - m;
	if (df <= 0) {
	throw new InferenceException("Invalid degrees of freedom: " + df);
	}
	return df;
	}

	/**
	* Gets the ratio between the sum of the observed and expected values.
	* The ratio can be used to scale the expected values to have the same sum
	* as the observed values:
	*
	* <pre>
	* sum(o) = sum(e * ratio)
	* </pre>
	*
	* <p>This method is common functionality shared between the Chi-square test and
	* G-test. The pre-conditions for those tests are performed by this method.
	*
	* @param expected Expected values.
	* @param observed Observed values.
	* @return the ratio
	* @throws IllegalArgumentException if the sample size is less than 2; the array
	* sizes do not match; {@code expected} has entries that are not strictly
	* positive; {@code observed} has negative entries; or all the observations are zero.
	*/
	static double computeRatio(double[] expected, long[] observed) {
	Arguments.checkValuesRequiredSize(expected.length, 2);
	Arguments.checkValuesSizeMatch(expected.length, observed.length);
	Arguments.checkStrictlyPositive(expected);
	Arguments.checkNonNegative(observed);
	final DD e = DD.create();
	final DD o = DD.create();
	for (int i = 0; i < observed.length; i++) {
	DD.fastAdd(e.hi(), e.lo(), expected[i], e);
	add(o, observed[i]);
	}
	if (o.doubleValue() == 0) {
	throw new InferenceException(InferenceException.NO_DATA);
	}
	// sum(o) / sum(e)
	final double ratio = DD.divide(o.hi(), o.lo(), e.hi(), e.lo(), e).doubleValue();
	// Allow a sum within 1 ulp of 1.0
	return Precision.equals(ratio, 1.0, 0) ? 1.0 : ratio;
	}

	/**
	* Adds the value to the sum.
	*
	* @param sum Sum.
	* @param v Value.
	*/
	private static void add(DD sum, long v) {
	// Split into hi and lo parts so the high part has 53-bits
	final double hi = v;
	final long lo = v - (long) hi;
	// The condition here is a high probability branch if the sample is
	// frequency counts which are typically in the 32-bit integer range.
	if (lo == 0) {
	DD.fastAdd(sum.hi(), sum.lo(), hi, sum);
	} else {
	DD.fastAdd(sum.hi(), sum.lo(), hi, lo, sum);
	}
	}

	/**
	* Returns the arithmetic mean of the entries in the input array,
	* or {@code NaN} if the array is empty.
	*
	* @param x Values.
	* @return the mean of the values or NaN if length = 0
	*/
	static double mean(long[] x) {
	final int n = x.length;
	if (n == 0) {
	return Double.NaN;
	}

	// Single pass high accuracy sum. The total cannot be more than 2^63 * 2^31 bits
	// so can be exactly represented in a double-double. Cumulative error in the sum
	// is (n-1) * 4eps with eps = 2^-106. The sum should be exact to double precision.
	final DD dd = DD.create();
	for (final long v : x) {
	add(dd, v);
	}

	return DD.divide(dd.hi(), dd.lo(), n, 0, dd).doubleValue();
	}

	/**
	* Returns the arithmetic mean of the entries in the input array,
	* or {@code NaN} if the array is empty.
	*
	* <p>A two-pass, corrected algorithm is used, starting with the definitional formula
	* computed using the array of stored values and then correcting this by adding the
	* mean deviation of the data values from the arithmetic mean. See, e.g. "Comparison
	* of Several Algorithms for Computing Sample Means and Variances," Robert F. Ling,
	* Journal of the American Statistical Association, Vol. 69, No. 348 (Dec., 1974), pp.
	* 859-866.
	*
	* @param x Values.
	* @return the mean of the values or NaN if length = 0
	*/
	static double mean(double[] x) {
	final int n = x.length;
	// No check for n == 0 -> return NaN.
	// This internal method is only called with non-zero length arrays.
	// The divide by zero creates NaN anyway.

	// Adapted from org.apache.commons.math4.legacy.stat.descriptive.moment.Mean
	// Updated to use a stream to support high-precision summation as the stream maintains
	// a rounding-error term during the aggregation. This is important
	// when summing differences which can create cancellation: x + -x => 0.

	// Compute initial estimate using definitional formula
	final double mean = Arrays.stream(x).sum() / n;

	// Compute correction factor in second pass
	return mean + Arrays.stream(x).map(v -> v - mean).sum() / n;
	}

	/**
	* Returns the arithmetic mean of the entries in the input arrays,
	* or {@code NaN} if the combined length of the arrays is zero.
	*
	* <p>This is the equivalent of using {@link #mean(double[])} with all the samples
	* concatenated into a single array. Supports a combined length above the maximum array
	* size.
	*
	* @param samples Values.
	* @return the mean of the values or NaN if length = 0
	* @see #mean(double[])
	*/
	static double mean(Collection<double[]> samples) {
	// See above for computation details
	final long n = samples.stream().mapToInt(x -> x.length).sum();
	final double mean = samples.stream().flatMapToDouble(Arrays::stream).sum() / n;
	return mean + samples.stream().flatMapToDouble(Arrays::stream).map(v -> v - mean).sum() / n;
	}

	/**
	* Returns the variance of the entries in the input array, or {@code NaN} if the array
	* is empty.
	*
	* <p>This method returns the bias-corrected sample variance (using {@code n - 1} in
	* the denominator).
	*
	* <p>Uses a two-pass algorithm. Specifically, these methods use the "corrected
	* two-pass algorithm" from Chan, Golub, Levesque, <i>Algorithms for Computing the
	* Sample Variance</i>, American Statistician, vol. 37, no. 3 (1983) pp.
	* 242-247.
	*
	* <p>Returns 0 for a single-value (i.e. length = 1) sample.
	*
	* @param x Values.
	* @param mean the mean of the input array
	* @return the variance of the values or NaN if the array is empty
	*/
	static double variance(double[] x, double mean) {
	final int n = x.length;
	// No check for n == 0 -> return NaN.
	// This internal method is only called with non-zero length arrays.
	// The input mean of NaN for zero length creates NaN anyway.
	if (n == 1) {
	return 0;
	}

	// Adapted from org.apache.commons.math4.legacy.stat.descriptive.moment.Variance
	// Use a stream to accumulate the sum of deviations in high precision.
	// This compensation term for the sum of deviations from the mean -> 0.
	// We sum the squares in standard precision as there is no cancellation of summands.
	final double[] sumSq = {0};
	final double sum2 = Arrays.stream(x).map(v -> {
	final double dx = v - mean;
	sumSq[0] += dx * dx;
	return dx;
	}).sum();

	final double sum1 = sumSq[0];
	// Bias corrected
	// Note: variance ~ sum1 / (n-1) but with a correction term sum2
	return (sum1 - (sum2 * sum2 / n)) / (n - 1);
	}

	/**
	* Returns the mean of the (signed) differences between corresponding elements of the
	* input arrays.
	*
	* <pre>
	* sum(x[i] - y[i]) / x.length
	* </pre>
	*
	* <p>This computes the same result as creating an array {@code z = x - y}
	* and calling {@link #mean(double[]) mean(z)}, but without the intermediate array
	* allocation.
	*
	* @param x First array.
	* @param y Second array.
	* @return mean of paired differences
	* @throws IllegalArgumentException if the arrays do not have the same length.
	* @see #mean(double[])
	*/
	static double meanDifference(double[] x, double[] y) {
	final int n = x.length;
	if (n != y.length) {
	throw new InferenceException(InferenceException.VALUES_MISMATCH, n, y.length);
	}
	// See mean(double[]) for details.
	final double mean = IntStream.range(0, n).mapToDouble(i -> x[i] - y[i]).sum() / n;
	return mean + IntStream.range(0, n).mapToDouble(i -> (x[i] - y[i]) - mean).sum() / n;
	}

	/**
	* Returns the variance of the (signed) differences between corresponding elements of
	* the input arrays.
	*
	* <pre>
	* var(x[i] - y[i])
	* </pre>
	*
	* <p>This computes the same result as creating an array {@code z = x - y}
	* and calling {@link #variance(double[], double) variance(z, mean(z))}, but without the
	* intermediate array allocation.
	*
	* @param x First array.
	* @param y Second array.
	* @param mean the mean difference between corresponding entries
	* @return variance of paired differences
	* @throws IllegalArgumentException if the arrays do not have the same length.
	* @see #meanDifference(double[], double[])
	* @see #variance(double[], double)
	*/
	static double varianceDifference(double[] x, double[] y, double mean) {
	final int n = x.length;
	if (n != y.length) {
	throw new InferenceException(InferenceException.VALUES_MISMATCH, n, y.length);
	}
	// See variance(double[]) for details.
	if (n == 1) {
	return 0;
	}
	final double[] sumSq = {0};
	final double sum2 = IntStream.range(0, n).mapToDouble(i -> {
	final double dx = (x[i] - y[i]) - mean;
	sumSq[0] += dx * dx;
	return dx;
	}).sum();
	final double sum1 = sumSq[0];
	return (sum1 - (sum2 * sum2 / n)) / (n - 1);
	}
	}