/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.commons.statistics.inference;
import java.lang.ref.SoftReference;
import java.util.Arrays;
import java.util.EnumSet;
import java.util.Objects;
import java.util.stream.IntStream;
import org.apache.commons.numbers.combinatorics.BinomialCoefficientDouble;
import org.apache.commons.statistics.distribution.NormalDistribution;
import org.apache.commons.statistics.ranking.NaNStrategy;
import org.apache.commons.statistics.ranking.NaturalRanking;
import org.apache.commons.statistics.ranking.RankingAlgorithm;
import org.apache.commons.statistics.ranking.TiesStrategy;
/**
* Implements the Mann-Whitney U test (also called Wilcoxon rank-sum test).
*
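* <p>Example (a minimal usage sketch; the sample arrays are illustrative):
* <pre>
* double[] x = {6.2, 7.1, 8.4, 9.0, 10.5};
* double[] y = {5.1, 5.8, 6.0, 6.4, 7.2};
* MannWhitneyUTest.Result r = MannWhitneyUTest.withDefaults().test(x, y);
* double u1 = r.getStatistic();
* boolean ties = r.hasTiedValues();
* </pre>
*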
* @see <a href="https://en.wikipedia.org/wiki/Mann%E2%80%93Whitney_U_test">
* Mann-Whitney U test (Wikipedia)</a>
* @since 1.1
*/
public final class MannWhitneyUTest {
/** Limit on sample size for the exact p-value computation for the auto mode. */
private static final int AUTO_LIMIT = 50;
/** Ranking instance. */
private static final RankingAlgorithm RANKING = new NaturalRanking(NaNStrategy.FAILED, TiesStrategy.AVERAGE);
/** Value for an unset f computation. */
private static final double UNSET = -1;
/** An object to use for synchronization when accessing the cache of F. */
private static final Object LOCK = new Object();
/** A reference to a previously computed storage for f.
* Use of a SoftReference ensures this is garbage collected before an OutOfMemoryError.
* The value should only be accessed, checked for size and optionally
* modified when holding the lock. When the storage is determined to be the correct
* size it can be returned for read/write to the array when not holding the lock. */
private static SoftReference<double[][][]> cacheF = new SoftReference<>(null);
/** Default instance. */
private static final MannWhitneyUTest DEFAULT = new MannWhitneyUTest(
AlternativeHypothesis.TWO_SIDED, PValueMethod.AUTO, true, 0);
/** Alternative hypothesis. */
private final AlternativeHypothesis alternative;
/** Method to compute the p-value. */
private final PValueMethod pValueMethod;
/** Perform continuity correction. */
private final boolean continuityCorrection;
/** Expected location shift. */
private final double mu;
/**
* Result for the Mann-Whitney U test.
*
* <p>This class is immutable.
*
* @since 1.1
*/
public static final class Result extends BaseSignificanceResult {
/** Flag indicating the data has tied values. */
private final boolean tiedValues;
/**
* Create an instance.
*
* @param statistic Test statistic.
* @param tiedValues Flag indicating the data has tied values.
* @param p Result p-value.
*/
Result(double statistic, boolean tiedValues, double p) {
super(statistic, p);
this.tiedValues = tiedValues;
}
/**
* {@inheritDoc}
*
* <p>This is the U<sub>1</sub> statistic. Compute the U<sub>2</sub> statistic from
* the original sample lengths {@code n} and {@code m} using:
* <pre>
* u2 = (long) n * m - u1;
* </pre>
*/
@Override
public double getStatistic() {
// Note: This method is here for documentation
return super.getStatistic();
}
/**
* Return {@code true} if the data had tied values.
*
* <p>Note: The exact computation cannot be used when there are tied values.
*
* @return {@code true} if there were tied values
*/
public boolean hasTiedValues() {
return tiedValues;
}
}
/**
* @param alternative Alternative hypothesis.
* @param method P-value method.
* @param continuityCorrection true to perform continuity correction.
* @param mu Expected location shift.
*/
private MannWhitneyUTest(AlternativeHypothesis alternative, PValueMethod method,
boolean continuityCorrection, double mu) {
this.alternative = alternative;
this.pValueMethod = method;
this.continuityCorrection = continuityCorrection;
this.mu = mu;
}
/**
* Return an instance using the default options.
*
* <ul>
* <li>{@link AlternativeHypothesis#TWO_SIDED}
* <li>{@link PValueMethod#AUTO}
* <li>{@link ContinuityCorrection#ENABLED}
* <li>{@linkplain #withMu(double) mu = 0}
* </ul>
*
* @return default instance
*/
public static MannWhitneyUTest withDefaults() {
return DEFAULT;
}
/**
* Return an instance with the configured alternative hypothesis.
*
* @param v Value.
* @return an instance
*/
public MannWhitneyUTest with(AlternativeHypothesis v) {
return new MannWhitneyUTest(Objects.requireNonNull(v), pValueMethod, continuityCorrection, mu);
}
/**
* Return an instance with the configured p-value method.
*
* @param v Value.
* @return an instance
* @throws IllegalArgumentException if the value is not in the allowed options or is null
*/
public MannWhitneyUTest with(PValueMethod v) {
return new MannWhitneyUTest(alternative,
Arguments.checkOption(v, EnumSet.of(PValueMethod.AUTO, PValueMethod.EXACT, PValueMethod.ASYMPTOTIC)),
continuityCorrection, mu);
}
/**
* Return an instance with the configured continuity correction.
*
* <p>If {@link ContinuityCorrection#ENABLED ENABLED}, adjust the U rank statistic by
* 0.5 towards the mean value when computing the z-statistic if a normal approximation is used
* to compute the p-value.
*
* @param v Value.
* @return an instance
*/
public MannWhitneyUTest with(ContinuityCorrection v) {
return new MannWhitneyUTest(alternative, pValueMethod,
Objects.requireNonNull(v) == ContinuityCorrection.ENABLED, mu);
}
/**
* Return an instance with the configured location shift {@code mu}.
*
* @param v Value.
* @return an instance
* @throws IllegalArgumentException if the value is not finite
*/
public MannWhitneyUTest withMu(double v) {
return new MannWhitneyUTest(alternative, pValueMethod, continuityCorrection, Arguments.checkFinite(v));
}
/**
* Computes the Mann-Whitney U statistic comparing two independent
* samples possibly of different length.
*
* <p>This statistic can be used to perform a Mann-Whitney U test evaluating the
* null hypothesis that the two independent samples differ by a location shift of {@code mu}.
*
* <p>This returns the U<sub>1</sub> statistic. Compute the U<sub>2</sub> statistic using:
* <pre>
* u2 = (long) x.length * y.length - u1;
* </pre>
*
* @param x First sample values.
* @param y Second sample values.
* @return Mann-Whitney U<sub>1</sub> statistic
* @throws IllegalArgumentException if {@code x} or {@code y} are zero-length; or contain
* NaN values.
* @see #withMu(double)
*/
public double statistic(double[] x, double[] y) {
checkSamples(x, y);
final double[] z = concatenateSamples(mu, x, y);
final double[] ranks = RANKING.apply(z);
// The ranks for x are in the first x.length entries in ranks because x
// is in the first x.length entries in z
final double sumRankX = Arrays.stream(ranks).limit(x.length).sum();
// U1 = R1 - (n1 * (n1 + 1)) / 2 where R1 is sum of ranks for sample 1,
// e.g. x, n1 is the number of observations in sample 1.
return sumRankX - ((long) x.length * (x.length + 1)) * 0.5;
}
/**
* Performs a Mann-Whitney U test comparing the locations of two independent
* samples. The location shift is specified using {@link #withMu(double) mu}.
*
* <p>The test is defined by the {@link AlternativeHypothesis}.
* <ul>
* <li>'two-sided': the distribution underlying {@code (x - mu)} is not equal to the
* distribution underlying {@code y}.
* <li>'greater': the distribution underlying {@code (x - mu)} is stochastically greater than
* the distribution underlying {@code y}.
* <li>'less': the distribution underlying {@code (x - mu)} is stochastically less than
* the distribution underlying {@code y}.
* </ul>
*
* <p>If the p-value method is {@linkplain PValueMethod#AUTO auto} an exact p-value is
* computed if both samples contain fewer than 50 values; otherwise a normal
* approximation is used.
*
* <p>Computation of the exact p-value is only valid if there are no tied
* ranks in the data; otherwise the computation falls back to the asymptotic
* approximation using a tie correction and an optional continuity correction.
*
* <p><strong>Note: </strong>
* Exact computation requires tabulation of values not exceeding size
* {@code (n+1)*(m+1)*(u+1)} where {@code u} is the minimum of the U<sub>1</sub> and
* U<sub>2</sub> statistics and {@code n} and {@code m} are the sample sizes.
* This may use a very large amount of memory and result in an {@link OutOfMemoryError}.
* Exact computation requires a finite binomial coefficient {@code binom(n+m, m)}
* which is limited to {@code n+m <= 1029} for any {@code n} and {@code m},
* or {@code min(n, m) <= 37} for any {@code max(n, m)}.
* An {@link OutOfMemoryError} is not expected using the
* limits configured for the {@linkplain PValueMethod#AUTO auto} p-value computation
* as the maximum required memory is approximately 23 MiB.
*
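* <p>Example (a minimal sketch; the option values are illustrative and {@code x}
* and {@code y} are sample arrays as described below):
* <pre>
* MannWhitneyUTest test = MannWhitneyUTest.withDefaults()
*     .with(AlternativeHypothesis.GREATER_THAN)
*     .withMu(1.0);
* MannWhitneyUTest.Result r = test.test(x, y);
* </pre>
*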
* @param x First sample values.
* @param y Second sample values.
* @return test result
* @throws IllegalArgumentException if {@code x} or {@code y} are zero-length; or contain
* NaN values.
* @throws OutOfMemoryError if the exact computation is <em>user-requested</em> for
* large samples and there is not enough memory.
* @see #statistic(double[], double[])
* @see #withMu(double)
* @see #with(AlternativeHypothesis)
* @see #with(ContinuityCorrection)
*/
public Result test(double[] x, double[] y) {
// Computation as above. The ranks are required for tie correction.
checkSamples(x, y);
final double[] z = concatenateSamples(mu, x, y);
final double[] ranks = RANKING.apply(z);
final double sumRankX = Arrays.stream(ranks).limit(x.length).sum();
final double u1 = sumRankX - ((long) x.length * (x.length + 1)) * 0.5;
final double c = WilcoxonSignedRankTest.calculateTieCorrection(ranks);
final boolean tiedValues = c != 0;
PValueMethod method = pValueMethod;
final int n = x.length;
final int m = y.length;
if (method == PValueMethod.AUTO && Math.max(n, m) < AUTO_LIMIT) {
method = PValueMethod.EXACT;
}
// Exact p requires no ties.
// The method will fail-fast if the computation is not possible due
// to the size of the data.
double p = method == PValueMethod.EXACT && !tiedValues ?
calculateExactPValue(u1, n, m, alternative) : -1;
if (p < 0) {
p = calculateAsymptoticPValue(u1, n, m, c);
}
return new Result(u1, tiedValues, p);
}
/**
* Ensures that the provided arrays fulfil the assumptions.
*
* @param x First sample values.
* @param y Second sample values.
* @throws IllegalArgumentException if {@code x} or {@code y} are zero-length.
*/
private static void checkSamples(double[] x, double[] y) {
Arguments.checkValuesRequiredSize(x.length, 1);
Arguments.checkValuesRequiredSize(y.length, 1);
}
/**
* Concatenate the samples into one array. Subtract {@code mu} from the first sample.
*
* @param mu Expected location shift.
* @param x First sample values.
* @param y Second sample values.
* @return concatenated array
*/
private static double[] concatenateSamples(double mu, double[] x, double[] y) {
final double[] z = new double[x.length + y.length];
System.arraycopy(x, 0, z, 0, x.length);
System.arraycopy(y, 0, z, x.length, y.length);
if (mu != 0) {
for (int i = 0; i < x.length; i++) {
z[i] -= mu;
}
}
return z;
}
/**
* Calculate the asymptotic p-value using a Normal approximation.
*
* @param u Mann-Whitney U value.
* @param n1 Number of subjects in first sample.
* @param n2 Number of subjects in second sample.
* @param c Tie correction, computed as the sum of {@code t^3 - t} over groups of {@code t} tied ranks.
* @return asymptotic p-value (two-sided, greater, or less using the alternative hypothesis)
*/
private double calculateAsymptoticPValue(double u, int n1, int n2, double c) {
// Use long to avoid overflow
final long n1n2 = (long) n1 * n2;
final long n = (long) n1 + n2;
// https://en.wikipedia.org/wiki/Mann%E2%80%93Whitney_U_test#Normal_approximation_and_tie_correction
final double e = n1n2 * 0.5;
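// Tie correction: c = sum(t^3 - t) over groups of t tied ranks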
final double variance = (n1n2 / 12.0) * ((n + 1.0) - c / n / (n - 1));
double z = u - e;
if (continuityCorrection) {
// +/- 0.5 is a continuity correction towards the expected.
if (alternative == AlternativeHypothesis.GREATER_THAN) {
z -= 0.5;
} else if (alternative == AlternativeHypothesis.LESS_THAN) {
z += 0.5;
} else {
// two-sided. Shift towards the expected of zero.
// Use of signum ignores z==0 (i.e. not copySign(0.5, z))
z -= Math.signum(z) * 0.5;
}
}
z /= Math.sqrt(variance);
final NormalDistribution standardNormal = NormalDistribution.of(0, 1);
if (alternative == AlternativeHypothesis.GREATER_THAN) {
return standardNormal.survivalProbability(z);
}
if (alternative == AlternativeHypothesis.LESS_THAN) {
return standardNormal.cumulativeProbability(z);
}
// two-sided
return 2 * standardNormal.survivalProbability(Math.abs(z));
}
/**
* Calculate the exact p-value. If the value cannot be computed this returns -1.
*
* <p>Note: Computation may run out of memory during array allocation or method
* recursion.
*
* @param u Mann-Whitney U value.
* @param m Number of subjects in first sample.
* @param n Number of subjects in second sample.
* @param alternative Alternative hypothesis.
* @return exact p-value (two-sided, greater, or less using the alternative hypothesis), or -1 if it cannot be computed
*/
// package-private for testing
static double calculateExactPValue(double u, int m, int n, AlternativeHypothesis alternative) {
// Check the computation can be attempted.
// u must be an integer
if ((int) u != u) {
return -1;
}
// Note: n+m will not overflow as we concatenated the samples to a single array.
final double binom = BinomialCoefficientDouble.value(n + m, m);
if (binom == Double.POSITIVE_INFINITY) {
return -1;
}
// Use u_min for the CDF.
final int u1 = (int) u;
final int u2 = (int) ((long) m * n - u1);
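// Note: u1 + u2 == m * n; the (u1 - 1, u2 + 1) arguments used for the
// survival function below preserve this pairing.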
// Use m < n to support symmetry.
final int n1 = Math.min(m, n);
final int n2 = Math.max(m, n);
// Return the correct side:
if (alternative == AlternativeHypothesis.GREATER_THAN) {
// sf(u1 - 1)
return sf(u1 - 1, u2 + 1, n1, n2, binom);
}
if (alternative == AlternativeHypothesis.LESS_THAN) {
// cdf(u1)
return cdf(u1, u2, n1, n2, binom);
}
// two-sided: 2 * sf(max(u1, u2) - 1) or 2 * cdf(min(u1, u2))
final double p = 2 * computeCdf(Math.min(u1, u2), n1, n2, binom);
// Clip to range: [0, 1]
return Math.min(1, p);
}
/**
* Compute the cumulative distribution function of the Mann-Whitney U1 statistic.
* The U2 statistic is passed for convenience to exploit symmetry in the distribution.
*
* @param u1 Mann-Whitney U1 statistic
* @param u2 Mann-Whitney U2 statistic
* @param m First sample size.
* @param n Second sample size.
* @param binom binom(n+m, m) (must be finite)
* @return {@code Pr(X <= u1)}
*/
private static double cdf(int u1, int u2, int m, int n, double binom) {
// Exploit symmetry. Note the distribution is discrete thus requiring (u2 - 1).
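// Pr(U <= u1) = Pr(U >= u2) = 1 - Pr(U <= u2 - 1) since u1 + u2 = m*n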
return u2 > u1 ?
computeCdf(u1, m, n, binom) :
1 - computeCdf(u2 - 1, m, n, binom);
}
/**
* Compute the survival function of the Mann-Whitney U1 statistic.
* The U2 statistic is passed for convenience to exploit symmetry in the distribution.
*
* @param u1 Mann-Whitney U1 statistic
* @param u2 Mann-Whitney U2 statistic
* @param m First sample size.
* @param n Second sample size.
* @param binom binom(n+m, m) (must be finite)
* @return {@code Pr(X > u1)}
*/
private static double sf(int u1, int u2, int m, int n, double binom) {
// Opposite of the CDF
return u2 > u1 ?
1 - computeCdf(u1, m, n, binom) :
computeCdf(u2 - 1, m, n, binom);
}
/**
* Compute the cumulative distribution function of the Mann-Whitney U statistic.
*
* <p>This should be called with the lower of U1 or U2 for computational efficiency.
*
* <p>Uses the recursive formula provided in Di Bucchianico, A. (1999).
* Combinatorics, computer algebra and the Wilcoxon-Mann-Whitney test. Journal
* of Statistical Planning and Inference, Volume 79, Issue 2, 349-364.
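*
* <p>The recursion, formula (3) of the reference, is:
* <pre>
* f(m, n, k) = f(m - 1, n, k - n) + f(m, n - 1, k)
* </pre>
* with {@code f(m, n, 0) == 1} and {@code f(m, n, k) == 0} when any of
* {@code m, n, k} is negative.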
*
* @param k Mann-Whitney U statistic
* @param m First sample size.
* @param n Second sample size.
* @param binom binom(n+m, m) (must be finite)
* @return {@code Pr(X <= k)}
*/
private static double computeCdf(int k, int m, int n, double binom) {
// Theorem 2.5:
// f(m, n, k) = 0 if k < 0, m < 0, n < 0, or k > mn
if (k < 0) {
return 0;
}
// Recursively compute f(m, n, k)
final double[][][] f = getF(m, n, k);
// P(X=k) = f(m, n, k) / binom(m+n, m)
// P(X<=k) = sum_0^k (P(X=i))
// Called with k = min(u1, u2) : max(p) ~ 0.5 so no need to clip to [0, 1]
return IntStream.rangeClosed(0, k).mapToDouble(i -> fmnk(f, m, n, i)).sum() / binom;
}
/**
* Gets the storage for f(m, n, k).
*
* <p>This may be cached for performance.
*
* @param m Parameter m of f(m, n, k).
* @param n Parameter n of f(m, n, k).
* @param k Parameter k of f(m, n, k).
* @return the storage for f
*/
private static double[][][] getF(int m, int n, int k) {
// Obtain any previous computation of f and expand it if required.
// F is only modified within this synchronized block.
// Any concurrent threads using a reference returned by this method
// will not receive an index out-of-bounds as f is only ever expanded.
synchronized (LOCK) {
// Note: f(x<m, y<n, z<k) is always the same.
// Cache the array and re-use any previous computation.
double[][][] f = cacheF.get();
// Require:
// f = new double[m + 1][n + 1][k + 1]
// f(m, n, 0) == 1; otherwise -1 if not computed
// m+n <= 1029 for any m,n; k < mn/2 (due to symmetry using min(u1, u2))
// Size m=n=515: approximately 516^2 * 515^2/2 ~ 3.5e10 doubles ~ 263 GiB
if (f == null) {
f = new double[m + 1][n + 1][k + 1];
for (final double[][] a : f) {
for (final double[] b : a) {
initialize(b);
}
}
// Cache for reuse.
cacheF = new SoftReference<>(f);
return f;
}
// Grow if required: m1 < m+1 => m1-(m+1) < 0 => m1 - m < 1
final int m1 = f.length;
final int n1 = f[0].length;
final int k1 = f[0][0].length;
final boolean growM = m1 - m < 1;
final boolean growN = n1 - n < 1;
final boolean growK = k1 - k < 1;
if (growM | growN | growK) {
// Some part of the previous f is too small.
// Atomically grow without destroying the previous computation.
// Any other thread using the previous f will not go out of bounds
// by keeping the new f dimensions at least as large.
// Note: Doing this in-place allows the memory to be gradually
// increased rather than allocating a new [m + 1][n + 1][k + 1]
// and copying all old values.
final int sn = Math.max(n1, n + 1);
final int sk = Math.max(k1, k + 1);
if (growM) {
// Entirely new region
f = Arrays.copyOf(f, m + 1);
for (int x = m1; x <= m; x++) {
f[x] = new double[sn][sk];
for (final double[] b : f[x]) {
initialize(b);
}
}
}
// Expand previous in place if required
if (growN) {
for (int x = 0; x < m1; x++) {
f[x] = Arrays.copyOf(f[x], sn);
for (int y = n1; y < sn; y++) {
final double[] b = f[x][y] = new double[sk];
initialize(b);
}
}
}
if (growK) {
for (int x = 0; x < m1; x++) {
for (int y = 0; y < n1; y++) {
final double[] b = f[x][y] = Arrays.copyOf(f[x][y], sk);
for (int z = k1; z < sk; z++) {
b[z] = UNSET;
}
}
}
}
// Avoided an OutOfMemoryError. Cache for reuse.
cacheF = new SoftReference<>(f);
}
return f;
}
}
/**
* Initialize the array for f(m, n, k).
* Set the value to 1 for k=0; otherwise {@link #UNSET}.
*
* @param fmn Array.
*/
private static void initialize(double[] fmn) {
Arrays.fill(fmn, UNSET);
// f(m, n, 0) == 1 if m >= 0, n >= 0
fmn[0] = 1;
}
/**
* Compute f(m, n, k), the number of subsets of {0, 1, ..., n} with m elements such
* that the elements of this subset add up to k.
*
* <p>The function is computed recursively.
*
* @param f Tabulated values of f[m][n][k].
* @param m Parameter m of f(m, n, k).
* @param n Parameter n of f(m, n, k).
* @param k Parameter k of f(m, n, k).
* @return f(m, n, k)
*/
private static double fmnk(double[][][] f, int m, int n, int k) {
// Theorem 2.5:
// Omit conditions that will not be met: k > mn
// f(m, n, k) = 0 if k < 0, m < 0, n < 0
if ((k | m | n) < 0) {
return 0;
}
// Compute on demand
double fmnk = f[m][n][k];
if (fmnk < 0) {
// f(m, n, 0) == 1 if m >= 0, n >= 0
// This is already computed.
// Recursion from formula (3):
// f(m, n, k) = f(m-1, n, k-n) + f(m, n-1, k)
f[m][n][k] = fmnk = fmnk(f, m - 1, n, k - n) + fmnk(f, m, n - 1, k);
}
return fmnk;
}
}