blob: b968162132038598e82164acdf4ff84eb88e3b2a [file] [log] [blame]
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
# Principal Component Analysis (PCA) for dimensionality reduction
# ---------------------------------------------------------------------------------------------
# NAME TYPE DEFAULT MEANING
# ---------------------------------------------------------------------------------------------
# X Matrix --- Input feature matrix
# K Int 2 Number of reduced dimensions (i.e., columns)
# center Boolean TRUE Indicates whether or not to center the feature matrix
# scale Boolean TRUE Indicates whether or not to scale the feature matrix
# ---------------------------------------------------------------------------------------------
# Xout Matrix --- Output feature matrix with K columns
# Mout Matrix --- Output dominant eigenvectors, one per column (can be used for projections)
# ---------------------------------------------------------------------------------------------
m_pca = function(Matrix[Double] X, Integer K=2, Boolean center=TRUE, Boolean scale=TRUE)
  return (Matrix[Double] Xout, Matrix[Double] Mout)
{
  # Projects X onto the K eigenvectors of its covariance matrix that have the
  # largest eigenvalues.
  #   X      input feature matrix (rows are observations, columns are features)
  #   K      number of output columns; clamped to ncol(X) if larger
  #   center forwarded to scale(): whether to center the columns
  #   scale  forwarded to scale(): whether to scale the columns
  # Returns
  #   Xout   the (optionally centered/scaled) X multiplied by Mout, K columns
  #   Mout   ncol(X) x K matrix holding the selected eigenvectors as columns
  N = nrow(X);
  D = ncol(X);

  # Only D eigenvectors exist; a larger K would index past the last column of
  # the eigenvector matrix below, so fall back to the maximum instead.
  if( K > D ) {
    print("PCA: K=" + K + " exceeds the number of columns in X, setting K=" + D);
    K = D;
  }

  # perform z-scoring (centering and scaling)
  X = scale(X, center, scale);

  # co-variance matrix; the mean term is subtracted explicitly so the result
  # is also correct when center=FALSE leaves X uncentered
  mu = colSums(X)/N;
  C = (t(X) %*% X)/(N-1) - (N/(N-1))*t(mu) %*% mu;

  # compute eigen vectors and values
  [evalues, evectors] = eigen(C);

  # decreasing_Idx[i] is the position of the i-th largest eigenvalue, and
  # diagmat is the matching permutation matrix with diagmat[i, decreasing_Idx[i]] = 1
  decreasing_Idx = order(target=evalues,by=1,decreasing=TRUE,index.return=TRUE);
  diagmat = table(seq(1,D),decreasing_Idx);

  # Sort eigenvectors column-wise by decreasing eigenvalue. Column i of the
  # result must be column decreasing_Idx[i] of the input, which requires the
  # TRANSPOSED permutation on the right-hand side; the untransposed matrix
  # would apply the inverse permutation and is only correct when the
  # permutation happens to be its own inverse (e.g., a pure reversal).
  evectors = evectors %*% t(diagmat);
  evec_dominant = evectors[,1:K];

  # Construct new data set by treating computed dominant eigenvectors as the basis vectors
  Xout = X %*% evec_dominant;
  Mout = evec_dominant;
}