diff options
Diffstat (limited to 'src/main/java/org/apache/commons/math3/stat/correlation/PearsonsCorrelation.java')
-rw-r--r-- | src/main/java/org/apache/commons/math3/stat/correlation/PearsonsCorrelation.java | 330 |
1 files changed, 330 insertions, 0 deletions
diff --git a/src/main/java/org/apache/commons/math3/stat/correlation/PearsonsCorrelation.java b/src/main/java/org/apache/commons/math3/stat/correlation/PearsonsCorrelation.java new file mode 100644 index 0000000..53d17ab --- /dev/null +++ b/src/main/java/org/apache/commons/math3/stat/correlation/PearsonsCorrelation.java @@ -0,0 +1,330 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.math3.stat.correlation; + +import org.apache.commons.math3.distribution.TDistribution; +import org.apache.commons.math3.exception.util.LocalizedFormats; +import org.apache.commons.math3.exception.MathIllegalArgumentException; +import org.apache.commons.math3.exception.NullArgumentException; +import org.apache.commons.math3.exception.DimensionMismatchException; +import org.apache.commons.math3.linear.RealMatrix; +import org.apache.commons.math3.linear.BlockRealMatrix; +import org.apache.commons.math3.stat.regression.SimpleRegression; +import org.apache.commons.math3.util.FastMath; + +/** + * Computes Pearson's product-moment correlation coefficients for pairs of arrays + * or columns of a matrix. + * + * <p>The constructors that take <code>RealMatrix</code> or + * <code>double[][]</code> arguments generate correlation matrices. The + * columns of the input matrices are assumed to represent variable values. + * Correlations are given by the formula</p> + * + * <p><code>cor(X, Y) = Σ[(x<sub>i</sub> - E(X))(y<sub>i</sub> - E(Y))] / [(n - 1)s(X)s(Y)]</code> + * where <code>E(X)</code> is the mean of <code>X</code>, <code>E(Y)</code> + * is the mean of the <code>Y</code> values and s(X), s(Y) are standard deviations.</p> + * + * <p>To compute the correlation coefficient for a single pair of arrays, use {@link #PearsonsCorrelation()} + * to construct an instance with no data and then {@link #correlation(double[], double[])}. + * Correlation matrices can also be computed directly from an instance with no data using + * {@link #computeCorrelationMatrix(double[][])}. In order to use {@link #getCorrelationMatrix()}, + * {@link #getCorrelationPValues()}, or {@link #getCorrelationStandardErrors()}; however, one of the + * constructors supplying data or a covariance matrix must be used to create the instance.</p> + * + * @since 2.0 + */ +public class PearsonsCorrelation { + + /** correlation matrix */ + private final RealMatrix correlationMatrix; + + /** number of observations */ + private final int nObs; + + /** + * Create a PearsonsCorrelation instance without data. + */ + public PearsonsCorrelation() { + super(); + correlationMatrix = null; + nObs = 0; + } + + /** + * Create a PearsonsCorrelation from a rectangular array + * whose columns represent values of variables to be correlated. + * + * Throws MathIllegalArgumentException if the input array does not have at least + * two columns and two rows. Pairwise correlations are set to NaN if one + * of the correlates has zero variance. + * + * @param data rectangular array with columns representing variables + * @throws MathIllegalArgumentException if the input data array is not + * rectangular with at least two rows and two columns. + * @see #correlation(double[], double[]) + */ + public PearsonsCorrelation(double[][] data) { + this(new BlockRealMatrix(data)); + } + + /** + * Create a PearsonsCorrelation from a RealMatrix whose columns + * represent variables to be correlated. + * + * Throws MathIllegalArgumentException if the matrix does not have at least + * two columns and two rows. Pairwise correlations are set to NaN if one + * of the correlates has zero variance. + * + * @param matrix matrix with columns representing variables to correlate + * @throws MathIllegalArgumentException if the matrix does not contain sufficient data + * @see #correlation(double[], double[]) + */ + public PearsonsCorrelation(RealMatrix matrix) { + nObs = matrix.getRowDimension(); + correlationMatrix = computeCorrelationMatrix(matrix); + } + + /** + * Create a PearsonsCorrelation from a {@link Covariance}. The correlation + * matrix is computed by scaling the Covariance's covariance matrix. + * The Covariance instance must have been created from a data matrix with + * columns representing variable values. + * + * @param covariance Covariance instance + */ + public PearsonsCorrelation(Covariance covariance) { + RealMatrix covarianceMatrix = covariance.getCovarianceMatrix(); + if (covarianceMatrix == null) { + throw new NullArgumentException(LocalizedFormats.COVARIANCE_MATRIX); + } + nObs = covariance.getN(); + correlationMatrix = covarianceToCorrelation(covarianceMatrix); + } + + /** + * Create a PearsonsCorrelation from a covariance matrix. The correlation + * matrix is computed by scaling the covariance matrix. + * + * @param covarianceMatrix covariance matrix + * @param numberOfObservations the number of observations in the dataset used to compute + * the covariance matrix + */ + public PearsonsCorrelation(RealMatrix covarianceMatrix, int numberOfObservations) { + nObs = numberOfObservations; + correlationMatrix = covarianceToCorrelation(covarianceMatrix); + } + + /** + * Returns the correlation matrix. + * + * <p>This method will return null if the argumentless constructor was used + * to create this instance, even if {@link #computeCorrelationMatrix(double[][])} + * has been called before it is activated.</p> + * + * @return correlation matrix + */ + public RealMatrix getCorrelationMatrix() { + return correlationMatrix; + } + + /** + * Returns a matrix of standard errors associated with the estimates + * in the correlation matrix.<br/> + * <code>getCorrelationStandardErrors().getEntry(i,j)</code> is the standard + * error associated with <code>getCorrelationMatrix.getEntry(i,j)</code> + * + * <p>The formula used to compute the standard error is <br/> + * <code>SE<sub>r</sub> = ((1 - r<sup>2</sup>) / (n - 2))<sup>1/2</sup></code> + * where <code>r</code> is the estimated correlation coefficient and + * <code>n</code> is the number of observations in the source dataset.</p> + * + * <p>To use this method, one of the constructors that supply an input + * matrix must have been used to create this instance.</p> + * + * @return matrix of correlation standard errors + * @throws NullPointerException if this instance was created with no data + */ + public RealMatrix getCorrelationStandardErrors() { + int nVars = correlationMatrix.getColumnDimension(); + double[][] out = new double[nVars][nVars]; + for (int i = 0; i < nVars; i++) { + for (int j = 0; j < nVars; j++) { + double r = correlationMatrix.getEntry(i, j); + out[i][j] = FastMath.sqrt((1 - r * r) /(nObs - 2)); + } + } + return new BlockRealMatrix(out); + } + + /** + * Returns a matrix of p-values associated with the (two-sided) null + * hypothesis that the corresponding correlation coefficient is zero. + * + * <p><code>getCorrelationPValues().getEntry(i,j)</code> is the probability + * that a random variable distributed as <code>t<sub>n-2</sub></code> takes + * a value with absolute value greater than or equal to <br> + * <code>|r|((n - 2) / (1 - r<sup>2</sup>))<sup>1/2</sup></code></p> + * + * <p>The values in the matrix are sometimes referred to as the + * <i>significance</i> of the corresponding correlation coefficients.</p> + * + * <p>To use this method, one of the constructors that supply an input + * matrix must have been used to create this instance.</p> + * + * @return matrix of p-values + * @throws org.apache.commons.math3.exception.MaxCountExceededException + * if an error occurs estimating probabilities + * @throws NullPointerException if this instance was created with no data + */ + public RealMatrix getCorrelationPValues() { + TDistribution tDistribution = new TDistribution(nObs - 2); + int nVars = correlationMatrix.getColumnDimension(); + double[][] out = new double[nVars][nVars]; + for (int i = 0; i < nVars; i++) { + for (int j = 0; j < nVars; j++) { + if (i == j) { + out[i][j] = 0d; + } else { + double r = correlationMatrix.getEntry(i, j); + double t = FastMath.abs(r * FastMath.sqrt((nObs - 2)/(1 - r * r))); + out[i][j] = 2 * tDistribution.cumulativeProbability(-t); + } + } + } + return new BlockRealMatrix(out); + } + + + /** + * Computes the correlation matrix for the columns of the + * input matrix, using {@link #correlation(double[], double[])}. + * + * Throws MathIllegalArgumentException if the matrix does not have at least + * two columns and two rows. Pairwise correlations are set to NaN if one + * of the correlates has zero variance. + * + * @param matrix matrix with columns representing variables to correlate + * @return correlation matrix + * @throws MathIllegalArgumentException if the matrix does not contain sufficient data + * @see #correlation(double[], double[]) + */ + public RealMatrix computeCorrelationMatrix(RealMatrix matrix) { + checkSufficientData(matrix); + int nVars = matrix.getColumnDimension(); + RealMatrix outMatrix = new BlockRealMatrix(nVars, nVars); + for (int i = 0; i < nVars; i++) { + for (int j = 0; j < i; j++) { + double corr = correlation(matrix.getColumn(i), matrix.getColumn(j)); + outMatrix.setEntry(i, j, corr); + outMatrix.setEntry(j, i, corr); + } + outMatrix.setEntry(i, i, 1d); + } + return outMatrix; + } + + /** + * Computes the correlation matrix for the columns of the + * input rectangular array. The columns of the array represent values + * of variables to be correlated. + * + * Throws MathIllegalArgumentException if the matrix does not have at least + * two columns and two rows or if the array is not rectangular. Pairwise + * correlations are set to NaN if one of the correlates has zero variance. + * + * @param data matrix with columns representing variables to correlate + * @return correlation matrix + * @throws MathIllegalArgumentException if the array does not contain sufficient data + * @see #correlation(double[], double[]) + */ + public RealMatrix computeCorrelationMatrix(double[][] data) { + return computeCorrelationMatrix(new BlockRealMatrix(data)); + } + + /** + * Computes the Pearson's product-moment correlation coefficient between two arrays. + * + * <p>Throws MathIllegalArgumentException if the arrays do not have the same length + * or their common length is less than 2. Returns {@code NaN} if either of the arrays + * has zero variance (i.e., if one of the arrays does not contain at least two distinct + * values).</p> + * + * @param xArray first data array + * @param yArray second data array + * @return Returns Pearson's correlation coefficient for the two arrays + * @throws DimensionMismatchException if the arrays lengths do not match + * @throws MathIllegalArgumentException if there is insufficient data + */ + public double correlation(final double[] xArray, final double[] yArray) { + SimpleRegression regression = new SimpleRegression(); + if (xArray.length != yArray.length) { + throw new DimensionMismatchException(xArray.length, yArray.length); + } else if (xArray.length < 2) { + throw new MathIllegalArgumentException(LocalizedFormats.INSUFFICIENT_DIMENSION, + xArray.length, 2); + } else { + for(int i=0; i<xArray.length; i++) { + regression.addData(xArray[i], yArray[i]); + } + return regression.getR(); + } + } + + /** + * Derives a correlation matrix from a covariance matrix. + * + * <p>Uses the formula <br/> + * <code>r(X,Y) = cov(X,Y)/s(X)s(Y)</code> where + * <code>r(·,·)</code> is the correlation coefficient and + * <code>s(·)</code> means standard deviation.</p> + * + * @param covarianceMatrix the covariance matrix + * @return correlation matrix + */ + public RealMatrix covarianceToCorrelation(RealMatrix covarianceMatrix) { + int nVars = covarianceMatrix.getColumnDimension(); + RealMatrix outMatrix = new BlockRealMatrix(nVars, nVars); + for (int i = 0; i < nVars; i++) { + double sigma = FastMath.sqrt(covarianceMatrix.getEntry(i, i)); + outMatrix.setEntry(i, i, 1d); + for (int j = 0; j < i; j++) { + double entry = covarianceMatrix.getEntry(i, j) / + (sigma * FastMath.sqrt(covarianceMatrix.getEntry(j, j))); + outMatrix.setEntry(i, j, entry); + outMatrix.setEntry(j, i, entry); + } + } + return outMatrix; + } + + /** + * Throws MathIllegalArgumentException if the matrix does not have at least + * two columns and two rows. + * + * @param matrix matrix to check for sufficiency + * @throws MathIllegalArgumentException if there is insufficient data + */ + private void checkSufficientData(final RealMatrix matrix) { + int nRows = matrix.getRowDimension(); + int nCols = matrix.getColumnDimension(); + if (nRows < 2 || nCols < 2) { + throw new MathIllegalArgumentException(LocalizedFormats.INSUFFICIENT_ROWS_AND_COLUMNS, + nRows, nCols); + } + } +} |