blob: 338e6a18f1bcf941beb868f9fa10d072d61ad522 [file] [log] [blame]
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
#
# This script performs Principal Component Analysis (PCA) on the given input data.
#
args <- commandArgs(TRUE)
library("Matrix")
A = readMM(paste(args[1], "A.mtx", sep=""));
K = ncol(A);
projectData = 0;
model = "";
center = 0;
scale = 0;
if (model != "") {
# reuse existing model to project data
} else if (model == "") {
N = nrow(A);
D = ncol(A);
# 1. perform z-scoring (centering and scaling)
if (center == 1) {
cm = matrix(1, nrow(A), 1) %*% colMeans(A);
A = A - cm
}
if (scale == 1) {
cvars = (colSums(A^2));
if (center == 1){
#cm = colMeans(A);
cvars = (cvars - N*(colMeans(A)^2))/(N-1);
}
Azscored = A / sqrt(cvars);
A = Azscored;
}
# 2. compute co-variance matrix
mu = colSums(A)/N;
C = (t(A) %*% A)/(N-1) - (N/(N-1))*(mu) %*% t(mu);
# 3. compute eigen vectors and values
R <- eigen(C);
evalues = R$values;
evectors = R$vectors;
# 4. make an index of values sorted according to magnitude of evalues
decreasing_Idx = order(as.vector(evalues), decreasing=TRUE);
diagmat = table(seq(1,D), decreasing_Idx);
# 5. sorts eigen values by decreasing order
evalues = diagmat %*% evalues;
# 6. sorts eigen vectors column-wise in the order of decreasing eigen values
evectors = evectors %*% diagmat;
# 7. select K dominant eigen vectors
nvec = ncol(evectors); # Here `nvec=K`
eval_dominant = evalues[1:K, 1];
evec_dominant = evectors[1:K,];
# 8. compute the std. deviation of dominant evalues
eval_stdev_dominant = sqrt(eval_dominant);
writeMM(as(eval_stdev_dominant, "CsparseMatrix"), paste(args[2],"dominant.eigen.standard.deviations", sep=""));
writeMM(as(eval_dominant, "CsparseMatrix"), paste(args[2], "dominant.eigen.values", sep=""));
writeMM(as(evec_dominant, "CsparseMatrix"), paste(args[2],"dominant.eigen.vectors", sep=""));
}