| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.commons.math4.stat.correlation; |
| |
| import org.apache.commons.statistics.distribution.TDistribution; |
| import org.apache.commons.math4.exception.DimensionMismatchException; |
| import org.apache.commons.math4.exception.MathIllegalArgumentException; |
| import org.apache.commons.math4.exception.NullArgumentException; |
| import org.apache.commons.math4.exception.util.LocalizedFormats; |
| import org.apache.commons.math4.linear.BlockRealMatrix; |
| import org.apache.commons.math4.linear.RealMatrix; |
| import org.apache.commons.math4.stat.regression.SimpleRegression; |
| import org.apache.commons.math4.util.FastMath; |
| |
| /** |
| * Computes Pearson's product-moment correlation coefficients for pairs of arrays |
| * or columns of a matrix. |
| * |
| * <p>The constructors that take <code>RealMatrix</code> or |
| * <code>double[][]</code> arguments generate correlation matrices. The |
| * columns of the input matrices are assumed to represent variable values. |
| * Correlations are given by the formula</p> |
| * |
| * <p><code>cor(X, Y) = Σ[(x<sub>i</sub> - E(X))(y<sub>i</sub> - E(Y))] / [(n - 1)s(X)s(Y)]</code> |
| * where <code>E(X)</code> is the mean of <code>X</code>, <code>E(Y)</code> |
| * is the mean of the <code>Y</code> values and s(X), s(Y) are standard deviations.</p> |
| * |
| * <p>To compute the correlation coefficient for a single pair of arrays, use {@link #PearsonsCorrelation()} |
| * to construct an instance with no data and then {@link #correlation(double[], double[])}. |
| * Correlation matrices can also be computed directly from an instance with no data using |
| * {@link #computeCorrelationMatrix(double[][])}. In order to use {@link #getCorrelationMatrix()}, |
| * {@link #getCorrelationPValues()}, or {@link #getCorrelationStandardErrors()}; however, one of the |
| * constructors supplying data or a covariance matrix must be used to create the instance.</p> |
| * |
| * @since 2.0 |
| */ |
| public class PearsonsCorrelation { |
| |
| /** correlation matrix */ |
| private final RealMatrix correlationMatrix; |
| |
| /** number of observations */ |
| private final int nObs; |
| |
| /** |
| * Create a PearsonsCorrelation instance without data. |
| */ |
| public PearsonsCorrelation() { |
| super(); |
| correlationMatrix = null; |
| nObs = 0; |
| } |
| |
| /** |
| * Create a PearsonsCorrelation from a rectangular array |
| * whose columns represent values of variables to be correlated. |
| * |
| * Throws MathIllegalArgumentException if the input array does not have at least |
| * two columns and two rows. Pairwise correlations are set to NaN if one |
| * of the correlates has zero variance. |
| * |
| * @param data rectangular array with columns representing variables |
| * @throws MathIllegalArgumentException if the input data array is not |
| * rectangular with at least two rows and two columns. |
| * @see #correlation(double[], double[]) |
| */ |
| public PearsonsCorrelation(double[][] data) { |
| this(new BlockRealMatrix(data)); |
| } |
| |
| /** |
| * Create a PearsonsCorrelation from a RealMatrix whose columns |
| * represent variables to be correlated. |
| * |
| * Throws MathIllegalArgumentException if the matrix does not have at least |
| * two columns and two rows. Pairwise correlations are set to NaN if one |
| * of the correlates has zero variance. |
| * |
| * @param matrix matrix with columns representing variables to correlate |
| * @throws MathIllegalArgumentException if the matrix does not contain sufficient data |
| * @see #correlation(double[], double[]) |
| */ |
| public PearsonsCorrelation(RealMatrix matrix) { |
| nObs = matrix.getRowDimension(); |
| correlationMatrix = computeCorrelationMatrix(matrix); |
| } |
| |
| /** |
| * Create a PearsonsCorrelation from a {@link Covariance}. The correlation |
| * matrix is computed by scaling the Covariance's covariance matrix. |
| * The Covariance instance must have been created from a data matrix with |
| * columns representing variable values. |
| * |
| * @param covariance Covariance instance |
| */ |
| public PearsonsCorrelation(Covariance covariance) { |
| RealMatrix covarianceMatrix = covariance.getCovarianceMatrix(); |
| if (covarianceMatrix == null) { |
| throw new NullArgumentException(LocalizedFormats.COVARIANCE_MATRIX); |
| } |
| nObs = covariance.getN(); |
| correlationMatrix = covarianceToCorrelation(covarianceMatrix); |
| } |
| |
| /** |
| * Create a PearsonsCorrelation from a covariance matrix. The correlation |
| * matrix is computed by scaling the covariance matrix. |
| * |
| * @param covarianceMatrix covariance matrix |
| * @param numberOfObservations the number of observations in the dataset used to compute |
| * the covariance matrix |
| */ |
| public PearsonsCorrelation(RealMatrix covarianceMatrix, int numberOfObservations) { |
| nObs = numberOfObservations; |
| correlationMatrix = covarianceToCorrelation(covarianceMatrix); |
| } |
| |
| /** |
| * Returns the correlation matrix. |
| * |
| * <p>This method will return null if the non-argument constructor was used |
| * to create this instance, even if {@link #computeCorrelationMatrix(double[][])} |
| * has been called before it is activated.</p> |
| * |
| * @return correlation matrix |
| */ |
| public RealMatrix getCorrelationMatrix() { |
| return correlationMatrix; |
| } |
| |
| /** |
| * Returns a matrix of standard errors associated with the estimates |
| * in the correlation matrix.<br> |
| * <code>getCorrelationStandardErrors().getEntry(i,j)</code> is the standard |
| * error associated with <code>getCorrelationMatrix.getEntry(i,j)</code> |
| * |
| * <p>The formula used to compute the standard error is <br> |
| * <code>SE<sub>r</sub> = ((1 - r<sup>2</sup>) / (n - 2))<sup>1/2</sup></code> |
| * where <code>r</code> is the estimated correlation coefficient and |
| * <code>n</code> is the number of observations in the source dataset.</p> |
| * |
| * <p>To use this method, one of the constructors that supply an input |
| * matrix must have been used to create this instance.</p> |
| * |
| * @return matrix of correlation standard errors |
| * @throws NullPointerException if this instance was created with no data |
| */ |
| public RealMatrix getCorrelationStandardErrors() { |
| int nVars = correlationMatrix.getColumnDimension(); |
| double[][] out = new double[nVars][nVars]; |
| for (int i = 0; i < nVars; i++) { |
| for (int j = 0; j < nVars; j++) { |
| double r = correlationMatrix.getEntry(i, j); |
| out[i][j] = FastMath.sqrt((1 - r * r) /(nObs - 2)); |
| } |
| } |
| return new BlockRealMatrix(out); |
| } |
| |
| /** |
| * Returns a matrix of p-values associated with the (two-sided) null |
| * hypothesis that the corresponding correlation coefficient is zero. |
| * |
| * <p><code>getCorrelationPValues().getEntry(i,j)</code> is the probability |
| * that a random variable distributed as <code>t<sub>n-2</sub></code> takes |
| * a value with absolute value greater than or equal to <br> |
| * <code>|r|((n - 2) / (1 - r<sup>2</sup>))<sup>1/2</sup></code></p> |
| * |
| * <p>The values in the matrix are sometimes referred to as the |
| * <i>significance</i> of the corresponding correlation coefficients.</p> |
| * |
| * <p>To use this method, one of the constructors that supply an input |
| * matrix must have been used to create this instance.</p> |
| * |
| * @return matrix of p-values |
| * @throws org.apache.commons.math4.exception.MaxCountExceededException |
| * if an error occurs estimating probabilities |
| * @throws NullPointerException if this instance was created with no data |
| */ |
| public RealMatrix getCorrelationPValues() { |
| TDistribution tDistribution = new TDistribution(nObs - 2); |
| int nVars = correlationMatrix.getColumnDimension(); |
| double[][] out = new double[nVars][nVars]; |
| for (int i = 0; i < nVars; i++) { |
| for (int j = 0; j < nVars; j++) { |
| if (i == j) { |
| out[i][j] = 0d; |
| } else { |
| double r = correlationMatrix.getEntry(i, j); |
| double t = FastMath.abs(r * FastMath.sqrt((nObs - 2)/(1 - r * r))); |
| out[i][j] = 2 * tDistribution.cumulativeProbability(-t); |
| } |
| } |
| } |
| return new BlockRealMatrix(out); |
| } |
| |
| |
| /** |
| * Computes the correlation matrix for the columns of the |
| * input matrix, using {@link #correlation(double[], double[])}. |
| * |
| * Throws MathIllegalArgumentException if the matrix does not have at least |
| * two columns and two rows. Pairwise correlations are set to NaN if one |
| * of the correlates has zero variance. |
| * |
| * @param matrix matrix with columns representing variables to correlate |
| * @return correlation matrix |
| * @throws MathIllegalArgumentException if the matrix does not contain sufficient data |
| * @see #correlation(double[], double[]) |
| */ |
| public RealMatrix computeCorrelationMatrix(RealMatrix matrix) { |
| checkSufficientData(matrix); |
| int nVars = matrix.getColumnDimension(); |
| RealMatrix outMatrix = new BlockRealMatrix(nVars, nVars); |
| for (int i = 0; i < nVars; i++) { |
| for (int j = 0; j < i; j++) { |
| double corr = correlation(matrix.getColumn(i), matrix.getColumn(j)); |
| outMatrix.setEntry(i, j, corr); |
| outMatrix.setEntry(j, i, corr); |
| } |
| outMatrix.setEntry(i, i, 1d); |
| } |
| return outMatrix; |
| } |
| |
| /** |
| * Computes the correlation matrix for the columns of the |
| * input rectangular array. The columns of the array represent values |
| * of variables to be correlated. |
| * |
| * Throws MathIllegalArgumentException if the matrix does not have at least |
| * two columns and two rows or if the array is not rectangular. Pairwise |
| * correlations are set to NaN if one of the correlates has zero variance. |
| * |
| * @param data matrix with columns representing variables to correlate |
| * @return correlation matrix |
| * @throws MathIllegalArgumentException if the array does not contain sufficient data |
| * @see #correlation(double[], double[]) |
| */ |
| public RealMatrix computeCorrelationMatrix(double[][] data) { |
| return computeCorrelationMatrix(new BlockRealMatrix(data)); |
| } |
| |
| /** |
| * Computes the Pearson's product-moment correlation coefficient between two arrays. |
| * |
| * <p>Throws MathIllegalArgumentException if the arrays do not have the same length |
| * or their common length is less than 2. Returns {@code NaN} if either of the arrays |
| * has zero variance (i.e., if one of the arrays does not contain at least two distinct |
| * values).</p> |
| * |
| * @param xArray first data array |
| * @param yArray second data array |
| * @return Returns Pearson's correlation coefficient for the two arrays |
| * @throws DimensionMismatchException if the arrays lengths do not match |
| * @throws MathIllegalArgumentException if there is insufficient data |
| */ |
| public double correlation(final double[] xArray, final double[] yArray) { |
| SimpleRegression regression = new SimpleRegression(); |
| if (xArray.length != yArray.length) { |
| throw new DimensionMismatchException(xArray.length, yArray.length); |
| } else if (xArray.length < 2) { |
| throw new MathIllegalArgumentException(LocalizedFormats.INSUFFICIENT_DIMENSION, |
| xArray.length, 2); |
| } else { |
| for(int i=0; i<xArray.length; i++) { |
| regression.addData(xArray[i], yArray[i]); |
| } |
| return regression.getR(); |
| } |
| } |
| |
| /** |
| * Derives a correlation matrix from a covariance matrix. |
| * |
| * <p>Uses the formula <br> |
| * <code>r(X,Y) = cov(X,Y)/s(X)s(Y)</code> where |
| * <code>r(·,·)</code> is the correlation coefficient and |
| * <code>s(·)</code> means standard deviation.</p> |
| * |
| * @param covarianceMatrix the covariance matrix |
| * @return correlation matrix |
| */ |
| public RealMatrix covarianceToCorrelation(RealMatrix covarianceMatrix) { |
| int nVars = covarianceMatrix.getColumnDimension(); |
| RealMatrix outMatrix = new BlockRealMatrix(nVars, nVars); |
| for (int i = 0; i < nVars; i++) { |
| double sigma = FastMath.sqrt(covarianceMatrix.getEntry(i, i)); |
| outMatrix.setEntry(i, i, 1d); |
| for (int j = 0; j < i; j++) { |
| double entry = covarianceMatrix.getEntry(i, j) / |
| (sigma * FastMath.sqrt(covarianceMatrix.getEntry(j, j))); |
| outMatrix.setEntry(i, j, entry); |
| outMatrix.setEntry(j, i, entry); |
| } |
| } |
| return outMatrix; |
| } |
| |
| /** |
| * Throws MathIllegalArgumentException if the matrix does not have at least |
| * two columns and two rows. |
| * |
| * @param matrix matrix to check for sufficiency |
| * @throws MathIllegalArgumentException if there is insufficient data |
| */ |
| private void checkSufficientData(final RealMatrix matrix) { |
| int nRows = matrix.getRowDimension(); |
| int nCols = matrix.getColumnDimension(); |
| if (nRows < 2 || nCols < 2) { |
| throw new MathIllegalArgumentException(LocalizedFormats.INSUFFICIENT_ROWS_AND_COLUMNS, |
| nRows, nCols); |
| } |
| } |
| } |