blob: 88241ef16b764f6d57011f99025ac09197b66318 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.commons.statistics.inference;
import java.util.Arrays;
import java.util.Collection;
import java.util.stream.IntStream;
import org.apache.commons.numbers.core.Precision;
/**
* Utility computation methods.
*
* @since 1.1
*/
final class StatisticUtils {
/** No instances. */
private StatisticUtils() {}
/**
* Compute {@code x - y}.
*
* <p>If {@code y} is zero the original array is returned, else a new array is created
* with the difference.
*
* @param x Array.
* @param y Value.
* @return x - y
* @throws NullPointerException if {@code x} is null and {@code y} is non-zero
*/
static double[] subtract(double[] x, double y) {
return y == 0 ? x : Arrays.stream(x).map(v -> v - y).toArray();
}
/**
* Compute the degrees of freedom as {@code n - 1 - m}.
*
* <p>This method is common functionality shared between the Chi-square test and
* G-test. The pre-conditions for those tests are performed by this method.
*
* @param n Number of observations.
* @param m Adjustment (assumed to be positive).
* @return the degrees of freedom
* @throws IllegalArgumentException if the degrees of freedom is not strictly positive
*/
static int computeDegreesOfFreedom(int n, int m) {
final int df = n - 1 - m;
if (df <= 0) {
throw new InferenceException("Invalid degrees of freedom: " + df);
}
return df;
}
/**
* Gets the ratio between the sum of the observed and expected values.
* The ratio can be used to scale the expected values to have the same sum
* as the observed values:
*
* <pre>
* sum(o) = sum(e * ratio)
* </pre>
*
* <p>This method is common functionality shared between the Chi-square test and
* G-test. The pre-conditions for those tests are performed by this method.
*
* @param expected Expected values.
* @param observed Observed values.
* @return the ratio
* @throws IllegalArgumentException if the sample size is less than 2; the array
* sizes do not match; {@code expected} has entries that are not strictly
* positive; {@code observed} has negative entries; or all the observations are zero.
*/
static double computeRatio(double[] expected, long[] observed) {
Arguments.checkValuesRequiredSize(expected.length, 2);
Arguments.checkValuesSizeMatch(expected.length, observed.length);
Arguments.checkStrictlyPositive(expected);
Arguments.checkNonNegative(observed);
final DD e = DD.create();
final DD o = DD.create();
for (int i = 0; i < observed.length; i++) {
DD.fastAdd(e.hi(), e.lo(), expected[i], e);
add(o, observed[i]);
}
if (o.doubleValue() == 0) {
throw new InferenceException(InferenceException.NO_DATA);
}
// sum(o) / sum(e)
final double ratio = DD.divide(o.hi(), o.lo(), e.hi(), e.lo(), e).doubleValue();
// Allow a sum within 1 ulp of 1.0
return Precision.equals(ratio, 1.0, 0) ? 1.0 : ratio;
}
/**
* Adds the value to the sum.
*
* @param sum Sum.
* @param v Value.
*/
private static void add(DD sum, long v) {
// Split into hi and lo parts so the high part has 53-bits
final double hi = v;
final long lo = v - (long) hi;
// The condition here is a high probability branch if the sample is
// frequency counts which are typically in the 32-bit integer range.
if (lo == 0) {
DD.fastAdd(sum.hi(), sum.lo(), hi, sum);
} else {
DD.fastAdd(sum.hi(), sum.lo(), hi, lo, sum);
}
}
/**
* Returns the arithmetic mean of the entries in the input array,
* or {@code NaN} if the array is empty.
*
* @param x Values.
* @return the mean of the values or NaN if length = 0
*/
static double mean(long[] x) {
final int n = x.length;
if (n == 0) {
return Double.NaN;
}
// Single pass high accuracy sum. The total cannot be more than 2^63 * 2^31 bits
// so can be exactly represented in a double-double. Cumulative error in the sum
// is (n-1) * 4eps with eps = 2^-106. The sum should be exact to double precision.
final DD dd = DD.create();
for (final long v : x) {
add(dd, v);
}
return DD.divide(dd.hi(), dd.lo(), n, 0, dd).doubleValue();
}
/**
* Returns the arithmetic mean of the entries in the input array,
* or {@code NaN} if the array is empty.
*
* <p>A two-pass, corrected algorithm is used, starting with the definitional formula
* computed using the array of stored values and then correcting this by adding the
* mean deviation of the data values from the arithmetic mean. See, e.g. "Comparison
* of Several Algorithms for Computing Sample Means and Variances," Robert F. Ling,
* Journal of the American Statistical Association, Vol. 69, No. 348 (Dec., 1974), pp.
* 859-866.
*
* @param x Values.
* @return the mean of the values or NaN if length = 0
*/
static double mean(double[] x) {
final int n = x.length;
// No check for n == 0 -> return NaN.
// This internal method is only called with non-zero length arrays.
// The divide by zero creates NaN anyway.
// Adapted from org.apache.commons.math4.legacy.stat.descriptive.moment.Mean
// Updated to use a stream to support high-precision summation as the stream maintains
// a rounding-error term during the aggregation. This is important
// when summing differences which can create cancellation: x + -x => 0.
// Compute initial estimate using definitional formula
final double mean = Arrays.stream(x).sum() / n;
// Compute correction factor in second pass
return mean + Arrays.stream(x).map(v -> v - mean).sum() / n;
}
/**
* Returns the arithmetic mean of the entries in the input arrays,
* or {@code NaN} if the combined length of the arrays is zero.
*
* <p>This is the equivalent of using {@link #mean(double[])} with all the samples
* concatenated into a single array. Supports a combined length above the maximum array
* size.
*
* @param samples Values.
* @return the mean of the values or NaN if length = 0
* @see #mean(double[])
*/
static double mean(Collection<double[]> samples) {
// See above for computation details
final long n = samples.stream().mapToInt(x -> x.length).sum();
final double mean = samples.stream().flatMapToDouble(Arrays::stream).sum() / n;
return mean + samples.stream().flatMapToDouble(Arrays::stream).map(v -> v - mean).sum() / n;
}
/**
* Returns the variance of the entries in the input array, or {@code NaN} if the array
* is empty.
*
* <p>This method returns the bias-corrected sample variance (using {@code n - 1} in
* the denominator).
*
* <p>Uses a two-pass algorithm. Specifically, these methods use the "corrected
* two-pass algorithm" from Chan, Golub, Levesque, <i>Algorithms for Computing the
* Sample Variance</i>, American Statistician, vol. 37, no. 3 (1983) pp.
* 242-247.
*
* <p>Returns 0 for a single-value (i.e. length = 1) sample.
*
* @param x Values.
* @param mean the mean of the input array
* @return the variance of the values or NaN if the array is empty
*/
static double variance(double[] x, double mean) {
final int n = x.length;
// No check for n == 0 -> return NaN.
// This internal method is only called with non-zero length arrays.
// The input mean of NaN for zero length creates NaN anyway.
if (n == 1) {
return 0;
}
// Adapted from org.apache.commons.math4.legacy.stat.descriptive.moment.Variance
// Use a stream to accumulate the sum of deviations in high precision.
// This compensation term for the sum of deviations from the mean -> 0.
// We sum the squares in standard precision as there is no cancellation of summands.
final double[] sumSq = {0};
final double sum2 = Arrays.stream(x).map(v -> {
final double dx = v - mean;
sumSq[0] += dx * dx;
return dx;
}).sum();
final double sum1 = sumSq[0];
// Bias corrected
// Note: variance ~ sum1 / (n-1) but with a correction term sum2
return (sum1 - (sum2 * sum2 / n)) / (n - 1);
}
/**
* Returns the mean of the (signed) differences between corresponding elements of the
* input arrays.
*
* <pre>
* sum(x[i] - y[i]) / x.length
* </pre>
*
* <p>This computes the same result as creating an array {@code z = x - y}
* and calling {@link #mean(double[]) mean(z)}, but without the intermediate array
* allocation.
*
* @param x First array.
* @param y Second array.
* @return mean of paired differences
* @throws IllegalArgumentException if the arrays do not have the same length.
* @see #mean(double[])
*/
static double meanDifference(double[] x, double[] y) {
final int n = x.length;
if (n != y.length) {
throw new InferenceException(InferenceException.VALUES_MISMATCH, n, y.length);
}
// See mean(double[]) for details.
final double mean = IntStream.range(0, n).mapToDouble(i -> x[i] - y[i]).sum() / n;
return mean + IntStream.range(0, n).mapToDouble(i -> (x[i] - y[i]) - mean).sum() / n;
}
/**
* Returns the variance of the (signed) differences between corresponding elements of
* the input arrays.
*
* <pre>
* var(x[i] - y[i])
* </pre>
*
* <p>This computes the same result as creating an array {@code z = x - y}
* and calling {@link #variance(double[], double) variance(z, mean(z))}, but without the
* intermediate array allocation.
*
* @param x First array.
* @param y Second array.
* @param mean the mean difference between corresponding entries
* @return variance of paired differences
* @throws IllegalArgumentException if the arrays do not have the same length.
* @see #meanDifference(double[], double[])
* @see #variance(double[], double)
*/
static double varianceDifference(double[] x, double[] y, double mean) {
final int n = x.length;
if (n != y.length) {
throw new InferenceException(InferenceException.VALUES_MISMATCH, n, y.length);
}
// See variance(double[]) for details.
if (n == 1) {
return 0;
}
final double[] sumSq = {0};
final double sum2 = IntStream.range(0, n).mapToDouble(i -> {
final double dx = (x[i] - y[i]) - mean;
sumSq[0] += dx * dx;
return dx;
}).sum();
final double sum1 = sumSq[0];
return (sum1 - (sum2 * sum2 / n)) / (n - 1);
}
}