/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.commons.statistics.inference;
import java.lang.ref.SoftReference;
import java.util.Arrays;
import java.util.EnumSet;
import java.util.Objects;
import java.util.stream.IntStream;
import org.apache.commons.numbers.combinatorics.BinomialCoefficientDouble;
import org.apache.commons.statistics.distribution.NormalDistribution;
import org.apache.commons.statistics.ranking.NaNStrategy;
import org.apache.commons.statistics.ranking.NaturalRanking;
import org.apache.commons.statistics.ranking.RankingAlgorithm;
import org.apache.commons.statistics.ranking.TiesStrategy;
/**
* Implements the Mann-Whitney U test (also called Wilcoxon rank-sum test).
*
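* <p>Example (a minimal usage sketch; the sample arrays are illustrative):
* <pre>
* double[] x = {6.2, 7.1, 8.4, 9.0, 10.5};
* double[] y = {5.1, 5.8, 6.0, 6.4, 7.2};
* MannWhitneyUTest.Result r = MannWhitneyUTest.withDefaults().test(x, y);
* double u1 = r.getStatistic();
* boolean ties = r.hasTiedValues();
* </pre>
*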
* @see <a href="https://en.wikipedia.org/wiki/Mann%E2%80%93Whitney_U_test">
* Mann-Whitney U test (Wikipedia)</a>
* @since 1.1
*/
public final class MannWhitneyUTest {
/** Limit on sample size for the exact p-value computation for the auto mode. */
private static final int AUTO_LIMIT = 50;
/** Ranking instance. */
private static final RankingAlgorithm RANKING = new NaturalRanking(NaNStrategy.FAILED, TiesStrategy.AVERAGE);
/** Value for an unset f computation. */
private static final double UNSET = -1;
/** An object to use for synchronization when accessing the cache of F. */
private static final Object LOCK = new Object();
/** A reference to a previously computed storage for f.
* Use of a SoftReference ensures this is garbage collected before an OutOfMemoryError.
* The value should only be accessed, checked for size and optionally
* modified when holding the lock. When the storage is determined to be the correct
* size it can be returned for read/write to the array when not holding the lock. */
private static SoftReference<double[][][]> cacheF = new SoftReference<>(null);
/** Default instance. */
private static final MannWhitneyUTest DEFAULT = new MannWhitneyUTest(
AlternativeHypothesis.TWO_SIDED, PValueMethod.AUTO, true, 0);
/** Alternative hypothesis. */
private final AlternativeHypothesis alternative;
/** Method to compute the p-value. */
private final PValueMethod pValueMethod;
/** Perform continuity correction. */
private final boolean continuityCorrection;
/** Expected location shift. */
private final double mu;
/**
* Result for the Mann-Whitney U test.
*
* <p>This class is immutable.
*
* @since 1.1
*/
public static final class Result extends BaseSignificanceResult {
/** Flag indicating the data has tied values. */
private final boolean tiedValues;
/**
* Create an instance.
*
* @param statistic Test statistic.
* @param tiedValues Flag indicating the data has tied values.
* @param p Result p-value.
*/
Result(double statistic, boolean tiedValues, double p) {
super(statistic, p);
this.tiedValues = tiedValues;
}
/**
* {@inheritDoc}
*
* <p>This is the U<sub>1</sub> statistic. Compute the U<sub>2</sub> statistic from
* the original sample lengths {@code n} and {@code m} using:
* <pre>
* u2 = (long) n * m - u1;
* </pre>
*/
@Override
public double getStatistic() {
// Note: This method is here for documentation
return super.getStatistic();
}
/**
* Return {@code true} if the data had tied values.
*
* <p>Note: The exact computation cannot be used when there are tied values.
*
* @return {@code true} if there were tied values
*/
public boolean hasTiedValues() {
return tiedValues;
}
}
/**
* @param alternative Alternative hypothesis.
* @param method P-value method.
* @param continuityCorrection true to perform continuity correction.
* @param mu Expected location shift.
*/
private MannWhitneyUTest(AlternativeHypothesis alternative, PValueMethod method,
boolean continuityCorrection, double mu) {
this.alternative = alternative;
this.pValueMethod = method;
this.continuityCorrection = continuityCorrection;
this.mu = mu;
}
/**
* Return an instance using the default options.
*
* <ul>
* <li>{@link AlternativeHypothesis#TWO_SIDED}
* <li>{@link PValueMethod#AUTO}
* <li>{@link ContinuityCorrection#ENABLED}
* <li>{@linkplain #withMu(double) mu = 0}
* </ul>
*
* @return default instance
*/
public static MannWhitneyUTest withDefaults() {
return DEFAULT;
}
/**
* Return an instance with the configured alternative hypothesis.
*
* @param v Value.
* @return an instance
*/
public MannWhitneyUTest with(AlternativeHypothesis v) {
return new MannWhitneyUTest(Objects.requireNonNull(v), pValueMethod, continuityCorrection, mu);
}
/**
* Return an instance with the configured p-value method.
*
* @param v Value.
* @return an instance
* @throws IllegalArgumentException if the value is not in the allowed options or is null
*/
public MannWhitneyUTest with(PValueMethod v) {
return new MannWhitneyUTest(alternative,
Arguments.checkOption(v, EnumSet.of(PValueMethod.AUTO, PValueMethod.EXACT, PValueMethod.ASYMPTOTIC)),
continuityCorrection, mu);
}
/**
* Return an instance with the configured continuity correction.
*
* <p>If {@link ContinuityCorrection#ENABLED ENABLED}, adjust the U rank statistic by
* 0.5 towards the mean value when computing the z-statistic if a normal approximation is used
* to compute the p-value.
*
* @param v Value.
* @return an instance
*/
public MannWhitneyUTest with(ContinuityCorrection v) {
return new MannWhitneyUTest(alternative, pValueMethod,
Objects.requireNonNull(v) == ContinuityCorrection.ENABLED, mu);
}
/**
* Return an instance with the configured location shift {@code mu}.
*
* @param v Value.
* @return an instance
* @throws IllegalArgumentException if the value is not finite
*/
public MannWhitneyUTest withMu(double v) {
return new MannWhitneyUTest(alternative, pValueMethod, continuityCorrection, Arguments.checkFinite(v));
}
/**
* Computes the Mann-Whitney U statistic comparing two independent
* samples possibly of different length.
*
* <p>This statistic can be used to perform a Mann-Whitney U test evaluating the
* null hypothesis that the two independent samples differ by a location shift of {@code mu}.
*
* <p>This returns the U<sub>1</sub> statistic. Compute the U<sub>2</sub> statistic using:
* <pre>
* u2 = (long) x.length * y.length - u1;
* </pre>
*
* @param x First sample values.
* @param y Second sample values.
* @return Mann-Whitney U<sub>1</sub> statistic
* @throws IllegalArgumentException if {@code x} or {@code y} are zero-length; or contain
* NaN values.
* @see #withMu(double)
*/
public double statistic(double[] x, double[] y) {
checkSamples(x, y);
final double[] z = concatenateSamples(mu, x, y);
final double[] ranks = RANKING.apply(z);
// The ranks for x are in the first x.length entries in ranks because x
// is in the first x.length entries in z
final double sumRankX = Arrays.stream(ranks).limit(x.length).sum();
// U1 = R1 - (n1 * (n1 + 1)) / 2 where R1 is sum of ranks for sample 1,
// e.g. x, n1 is the number of observations in sample 1.
return sumRankX - ((long) x.length * (x.length + 1)) * 0.5;
}
/**
* Performs a Mann-Whitney U test comparing the locations of two independent
* samples. The location shift is specified using {@link #withMu(double) mu}.
*
* <p>The test is defined by the {@link AlternativeHypothesis}.
* <ul>
* <li>'two-sided': the distribution underlying {@code (x - mu)} is not equal to the
* distribution underlying {@code y}.
* <li>'greater': the distribution underlying {@code (x - mu)} is stochastically greater than
* the distribution underlying {@code y}.
* <li>'less': the distribution underlying {@code (x - mu)} is stochastically less than
* the distribution underlying {@code y}.
* </ul>
*
* <p>If the p-value method is {@linkplain PValueMethod#AUTO auto} an exact p-value is
* computed if both samples contain fewer than 50 values; otherwise a normal
* approximation is used.
*
* <p>Computation of the exact p-value is only valid if there are no tied
* ranks in the data; otherwise the computation falls back to the asymptotic
* approximation using a tie correction and an optional continuity correction.
*
* <p><strong>Note: </strong>
* Exact computation requires tabulation of values not exceeding size
* {@code (n+1)*(m+1)*(u+1)} where {@code u} is the minimum of the U<sub>1</sub> and
* U<sub>2</sub> statistics and {@code n} and {@code m} are the sample sizes.
* This may use a very large amount of memory and result in an {@link OutOfMemoryError}.
* Exact computation requires a finite binomial coefficient {@code binom(n+m, m)}
* which is limited to {@code n+m <= 1029} for any {@code n} and {@code m},
* or {@code min(n, m) <= 37} for any {@code max(n, m)}.
* An {@link OutOfMemoryError} is not expected using the
* limits configured for the {@linkplain PValueMethod#AUTO auto} p-value computation
* as the maximum required memory is approximately 23 MiB.
*
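* <p>Example (a minimal sketch; the option values are illustrative and {@code x}
* and {@code y} are sample arrays as described below):
* <pre>
* MannWhitneyUTest test = MannWhitneyUTest.withDefaults()
*     .with(AlternativeHypothesis.GREATER_THAN)
*     .withMu(1.0);
* MannWhitneyUTest.Result r = test.test(x, y);
* </pre>
*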
* @param x First sample values.
* @param y Second sample values.
* @return test result
* @throws IllegalArgumentException if {@code x} or {@code y} are zero-length; or contain
* NaN values.
* @throws OutOfMemoryError if the exact computation is <em>user-requested</em> for
* large samples and there is not enough memory.
* @see #statistic(double[], double[])
* @see #withMu(double)
* @see #with(AlternativeHypothesis)
* @see #with(ContinuityCorrection)
*/
public Result test(double[] x, double[] y) {
// Computation as above. The ranks are required for tie correction.
checkSamples(x, y);
final double[] z = concatenateSamples(mu, x, y);
final double[] ranks = RANKING.apply(z);
final double sumRankX = Arrays.stream(ranks).limit(x.length).sum();
final double u1 = sumRankX - ((long) x.length * (x.length + 1)) * 0.5;
final double c = WilcoxonSignedRankTest.calculateTieCorrection(ranks);
final boolean tiedValues = c != 0;
PValueMethod method = pValueMethod;
final int n = x.length;
final int m = y.length;
if (method == PValueMethod.AUTO && Math.max(n, m) < AUTO_LIMIT) {
method = PValueMethod.EXACT;
}
// Exact p requires no ties.
// The method will fail-fast if the computation is not possible due
// to the size of the data.
double p = method == PValueMethod.EXACT && !tiedValues ?
calculateExactPValue(u1, n, m, alternative) : -1;
if (p < 0) {
p = calculateAsymptoticPValue(u1, n, m, c);
}
return new Result(u1, tiedValues, p);
}
/**
* Ensures that the provided arrays fulfil the assumptions.
*
* @param x First sample values.
* @param y Second sample values.
* @throws IllegalArgumentException if {@code x} or {@code y} are zero-length.
*/
private static void checkSamples(double[] x, double[] y) {
Arguments.checkValuesRequiredSize(x.length, 1);
Arguments.checkValuesRequiredSize(y.length, 1);
}
/**
* Concatenate the samples into one array. Subtract {@code mu} from the first sample.
*
* @param mu Expected location shift.
* @param x First sample values.
* @param y Second sample values.
* @return concatenated array
*/
private static double[] concatenateSamples(double mu, double[] x, double[] y) {
final double[] z = new double[x.length + y.length];
System.arraycopy(x, 0, z, 0, x.length);
System.arraycopy(y, 0, z, x.length, y.length);
if (mu != 0) {
for (int i = 0; i < x.length; i++) {
z[i] -= mu;
}
}
return z;
}
/**
* Calculate the asymptotic p-value using a Normal approximation.
*
* @param u Mann-Whitney U value.
* @param n1 Number of subjects in first sample.
* @param n2 Number of subjects in second sample.
* @param c Tie correction, computed as the sum of {@code t^3 - t} over groups of {@code t} tied ranks.
* @return asymptotic p-value (two-sided, greater, or less using the alternative hypothesis)
*/
private double calculateAsymptoticPValue(double u, int n1, int n2, double c) {
// Use long to avoid overflow
final long n1n2 = (long) n1 * n2;
final long n = (long) n1 + n2;
// https://en.wikipedia.org/wiki/Mann%E2%80%93Whitney_U_test#Normal_approximation_and_tie_correction
final double e = n1n2 * 0.5;
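// Tie correction: c = sum(t^3 - t) over groups of t tied ranks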
final double variance = (n1n2 / 12.0) * ((n + 1.0) - c / n / (n - 1));
double z = u - e;
if (continuityCorrection) {
// +/- 0.5 is a continuity correction towards the expected.
if (alternative == AlternativeHypothesis.GREATER_THAN) {
z -= 0.5;
} else if (alternative == AlternativeHypothesis.LESS_THAN) {
z += 0.5;
} else {
// two-sided. Shift towards the expected of zero.
// Use of signum ignores z==0 (i.e. not copySign(0.5, z))
z -= Math.signum(z) * 0.5;
}
}
z /= Math.sqrt(variance);
final NormalDistribution standardNormal = NormalDistribution.of(0, 1);
if (alternative == AlternativeHypothesis.GREATER_THAN) {
return standardNormal.survivalProbability(z);
}
if (alternative == AlternativeHypothesis.LESS_THAN) {
return standardNormal.cumulativeProbability(z);
}
// two-sided
return 2 * standardNormal.survivalProbability(Math.abs(z));
}
/**
* Calculate the exact p-value. If the value cannot be computed this returns -1.
*
* <p>Note: Computation may run out of memory during array allocation or method
* recursion.
*
* @param u Mann-Whitney U value.
* @param m Number of subjects in first sample.
* @param n Number of subjects in second sample.
* @param alternative Alternative hypothesis.
* @return exact p-value (two-sided, greater, or less using the alternative hypothesis), or -1 if it cannot be computed
*/
// package-private for testing
static double calculateExactPValue(double u, int m, int n, AlternativeHypothesis alternative) {
// Check the computation can be attempted.
// u must be an integer
if ((int) u != u) {
return -1;
}
// Note: n+m will not overflow as we concatenated the samples to a single array.
final double binom = BinomialCoefficientDouble.value(n + m, m);
if (binom == Double.POSITIVE_INFINITY) {
return -1;
}
// Use u_min for the CDF.
final int u1 = (int) u;
final int u2 = (int) ((long) m * n - u1);
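// Note: u1 + u2 == m * n; the (u1 - 1, u2 + 1) arguments used for the
// survival function below preserve this pairing.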
// Use m < n to support symmetry.
final int n1 = Math.min(m, n);
final int n2 = Math.max(m, n);
// Return the correct side:
if (alternative == AlternativeHypothesis.GREATER_THAN) {
// sf(u1 - 1)
return sf(u1 - 1, u2 + 1, n1, n2, binom);
}
if (alternative == AlternativeHypothesis.LESS_THAN) {
// cdf(u1)
return cdf(u1, u2, n1, n2, binom);
}
// two-sided: 2 * sf(max(u1, u2) - 1) or 2 * cdf(min(u1, u2))
final double p = 2 * computeCdf(Math.min(u1, u2), n1, n2, binom);
// Clip to range: [0, 1]
return Math.min(1, p);
}
/**
* Compute the cumulative distribution function of the Mann-Whitney U1 statistic.
* The U2 statistic is passed for convenience to exploit symmetry in the distribution.
*
* @param u1 Mann-Whitney U1 statistic
* @param u2 Mann-Whitney U2 statistic
* @param m First sample size.
* @param n Second sample size.
* @param binom binom(n+m, m) (must be finite)
* @return {@code Pr(X <= u1)}
*/
private static double cdf(int u1, int u2, int m, int n, double binom) {
// Exploit symmetry. Note the distribution is discrete thus requiring (u2 - 1).
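// Pr(U <= u1) = Pr(U >= u2) = 1 - Pr(U <= u2 - 1) since u1 + u2 = m*n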
return u2 > u1 ?
computeCdf(u1, m, n, binom) :
1 - computeCdf(u2 - 1, m, n, binom);
}
/**
* Compute the survival function of the Mann-Whitney U1 statistic.
* The U2 statistic is passed for convenience to exploit symmetry in the distribution.
*
* @param u1 Mann-Whitney U1 statistic
* @param u2 Mann-Whitney U2 statistic
* @param m First sample size.
* @param n Second sample size.
* @param binom binom(n+m, m) (must be finite)
* @return {@code Pr(X > u1)}
*/
private static double sf(int u1, int u2, int m, int n, double binom) {
// Opposite of the CDF
return u2 > u1 ?
1 - computeCdf(u1, m, n, binom) :
computeCdf(u2 - 1, m, n, binom);
}
/**
* Compute the cumulative distribution function of the Mann-Whitney U statistic.
*
* <p>This should be called with the lower of U1 or U2 for computational efficiency.
*
* <p>Uses the recursive formula provided in Di Bucchianico, A. (1999).
* Combinatorics, computer algebra and the Wilcoxon-Mann-Whitney test. Journal
* of Statistical Planning and Inference, Volume 79, Issue 2, 349-364.
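*
* <p>The recursion, formula (3) of the reference, is:
* <pre>
* f(m, n, k) = f(m - 1, n, k - n) + f(m, n - 1, k)
* </pre>
* with {@code f(m, n, 0) == 1} and {@code f(m, n, k) == 0} when any of
* {@code m, n, k} is negative.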
*
* @param k Mann-Whitney U statistic
* @param m First sample size.
* @param n Second sample size.
* @param binom binom(n+m, m) (must be finite)
* @return {@code Pr(X <= k)}
*/
private static double computeCdf(int k, int m, int n, double binom) {
// Theorem 2.5:
// f(m, n, k) = 0 if k < 0, m < 0, n < 0, or k > mn
if (k < 0) {
return 0;
}
// Recursively compute f(m, n, k)
final double[][][] f = getF(m, n, k);
// P(X=k) = f(m, n, k) / binom(m+n, m)
// P(X<=k) = sum_0^k (P(X=i))
// Called with k = min(u1, u2) : max(p) ~ 0.5 so no need to clip to [0, 1]
return IntStream.rangeClosed(0, k).mapToDouble(i -> fmnk(f, m, n, i)).sum() / binom;
}
/**
* Gets the storage for f(m, n, k).
*
* <p>This may be cached for performance.
*
* @param m Parameter m of f(m, n, k).
* @param n Parameter n of f(m, n, k).
* @param k Parameter k of f(m, n, k).
* @return the storage for f
*/
private static double[][][] getF(int m, int n, int k) {
// Obtain any previous computation of f and expand it if required.
// F is only modified within this synchronized block.
// Any concurrent threads using a reference returned by this method
// will not receive an index out-of-bounds as f is only ever expanded.
synchronized (LOCK) {
// Note: f(x<m, y<n, z<k) is always the same.
// Cache the array and re-use any previous computation.
double[][][] f = cacheF.get();
// Require:
// f = new double[m + 1][n + 1][k + 1]
// f(m, n, 0) == 1; otherwise -1 if not computed
// m+n <= 1029 for any m,n; k < mn/2 (due to symmetry using min(u1, u2))
// Size m=n=515: approximately 516^2 * 515^2/2 ~ 3.5e10 doubles ~ 263 GiB
if (f == null) {
f = new double[m + 1][n + 1][k + 1];
for (final double[][] a : f) {
for (final double[] b : a) {
initialize(b);
}
}
// Cache for reuse.
cacheF = new SoftReference<>(f);
return f;
}
// Grow if required: m1 < m+1 => m1-(m+1) < 0 => m1 - m < 1
final int m1 = f.length;
final int n1 = f[0].length;
final int k1 = f[0][0].length;
final boolean growM = m1 - m < 1;
final boolean growN = n1 - n < 1;
final boolean growK = k1 - k < 1;
if (growM | growN | growK) {
// Some part of the previous f is too small.
// Atomically grow without destroying the previous computation.
// Any other thread using the previous f will not go out of bounds
// by keeping the new f dimensions at least as large.
// Note: Doing this in-place allows the memory to be gradually
// increased rather than allocating a new [m + 1][n + 1][k + 1]
// and copying all old values.
final int sn = Math.max(n1, n + 1);
final int sk = Math.max(k1, k + 1);
if (growM) {
// Entirely new region
f = Arrays.copyOf(f, m + 1);
for (int x = m1; x <= m; x++) {
f[x] = new double[sn][sk];
for (final double[] b : f[x]) {
initialize(b);
}
}
}
// Expand previous in place if required
if (growN) {
for (int x = 0; x < m1; x++) {
f[x] = Arrays.copyOf(f[x], sn);
for (int y = n1; y < sn; y++) {
final double[] b = f[x][y] = new double[sk];
initialize(b);
}
}
}
if (growK) {
for (int x = 0; x < m1; x++) {
for (int y = 0; y < n1; y++) {
final double[] b = f[x][y] = Arrays.copyOf(f[x][y], sk);
for (int z = k1; z < sk; z++) {
b[z] = UNSET;
}
}
}
}
// Avoided an OutOfMemoryError. Cache for reuse.
cacheF = new SoftReference<>(f);
}
return f;
}
}
/**
* Initialize the array for f(m, n, k).
* Set the value to 1 for k=0; otherwise {@link #UNSET}.
*
* @param fmn Array.
*/
private static void initialize(double[] fmn) {
Arrays.fill(fmn, UNSET);
// f(m, n, 0) == 1 if m >= 0, n >= 0
fmn[0] = 1;
}
/**
* Compute f(m, n, k), the number of subsets of {0, 1, ..., n} with m elements such
* that the elements of this subset add up to k.
*
* <p>The function is computed recursively.
*
* @param f Tabulated values of f[m][n][k].
* @param m Parameter m of f(m, n, k).
* @param n Parameter n of f(m, n, k).
* @param k Parameter k of f(m, n, k).
* @return f(m, n, k)
*/
private static double fmnk(double[][][] f, int m, int n, int k) {
// Theorem 2.5:
// Omit conditions that will not be met: k > mn
// f(m, n, k) = 0 if k < 0, m < 0, n < 0
if ((k | m | n) < 0) {
return 0;
}
// Compute on demand
double fmnk = f[m][n][k];
if (fmnk < 0) {
// f(m, n, 0) == 1 if m >= 0, n >= 0
// This is already computed.
// Recursion from formula (3):
// f(m, n, k) = f(m-1, n, k-n) + f(m, n-1, k)
f[m][n][k] = fmnk = fmnk(f, m - 1, n, k - n) + fmnk(f, m, n - 1, k);
}
return fmnk;
}
}