# blob: 65800b70c45732eeb3a0561ef26b47b582aa49b0 [file] [log] [blame]
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
#
# This script performs Principal Component Analysis (PCA) on the given input data.
#
# INPUT PARAMETERS:
# ---------------------------------------------------------------------------------------------
# NAME TYPE DEFAULT MEANING
# ---------------------------------------------------------------------------------------------
# INPUT String --- Location to read the matrix A of feature vectors
# K Int ncol(A) Dimension of the new vector space constructed from the eigenvectors (number of dominant eigenvectors kept)
# CENTER Int 0 Indicates whether or not to center data
# SCALE Int 0 Indicates whether or not to scale data
# OFMT String CSV Output data format
# PROJDATA Int 0 This argument indicates if the data should be projected or not
# MODEL String "" Location of an already existing model (eigenvectors and eigenvalues) to reuse; if empty, a new model is computed and written to OUTPUT
# OUTPUT String / Location to write output matrices (dominant eigenvalues and their standard deviations,
# dominant eigenvectors, and data projected onto the new basis vectors)
# Example:
# hadoop jar SystemML.jar -f PCA.dml -nvargs INPUT=INPUT_DIR/pca-1000x1000
#   OUTPUT=OUTPUT_DIR/pca-1000x1000-model PROJDATA=1 CENTER=1 SCALE=1
# ---------------------------------------------------------------------------------------------
# Read the feature matrix and resolve the optional named arguments to their defaults.
A = read($INPUT);
K = ifdef($K, ncol(A));
ofmt = ifdef($OFMT, "CSV");
projectData = ifdef($PROJDATA, 0);
model = ifdef($MODEL, "");
center = ifdef($CENTER, 0);
scale = ifdef($SCALE, 0);
output = ifdef($OUTPUT, "/");

# Remember whether the caller supplied an existing model BEFORE `model` is
# defaulted to `output` below. The final projection guard must depend on this
# flag; testing `model != ""` after the defaulting would always be true and
# would silently ignore PROJDATA=0.
reuseModel = (model != "");

# Placeholder so that evec_dominant is defined on every branch.
evec_dominant = matrix(0, cols=1, rows=1);

if (reuseModel) {
  # Reuse an existing model to project the data.
  # NOTE(review): A is neither centered nor scaled on this path, and the model
  # directory does not store the training means/variances -- confirm that
  # callers pre-process A consistently with how the model was trained.
  evec_dominant = read(model+"/dominant.eigen.vectors");
} else {
  # No model supplied: compute one and store it under the output location.
  model = output;

  N = nrow(A);
  D = ncol(A);

  # Perform z-scoring (centering and scaling).
  if (center == 1) {
    cm = colMeans(A);
    A = A - cm;
  }
  if (scale == 1) {
    cvars = (colSums(A^2));
    if (center == 1) {
      # A is already centered here, so cm is (numerically) zero and cvars
      # becomes the sample variance colSums(A^2)/(N-1).
      cm = colMeans(A);
      cvars = (cvars - N*(cm^2))/(N-1);
    }
    # NOTE(review): without centering, columns are divided by the root of the
    # raw sum of squares (not divided by N-1) -- confirm this is intended.
    A = (A)/sqrt(cvars);
  }

  # Co-variance matrix.
  mu = colSums(A)/N;
  C = (t(A) %*% A)/(N-1) - (N/(N-1))*t(mu) %*% mu;

  # Compute eigen vectors and values.
  [evalues, evectors] = eigen(C);

  # Build a permutation matrix from the indices that sort the eigenvalues
  # in decreasing order.
  decreasing_Idx = order(target=evalues, by=1, decreasing=TRUE, index.return=TRUE);
  diagmat = table(seq(1,D), decreasing_Idx);
  # Sort eigenvalues in decreasing order.
  evalues = diagmat %*% evalues;
  # Sort eigenvectors column-wise in the order of decreasing eigenvalues.
  evectors = evectors %*% diagmat;

  # Select the K dominant eigenvalues / eigenvectors.
  eval_dominant = evalues[1:K, 1];
  evec_dominant = evectors[,1:K];

  # The square root of the eigenvalues.
  eval_stdev_dominant = sqrt(eval_dominant);

  write(eval_stdev_dominant, model+"/dominant.eigen.standard.deviations", format=ofmt);
  write(eval_dominant, model+"/dominant.eigen.values", format=ofmt);
  write(evec_dominant, model+"/dominant.eigen.vectors", format=ofmt);
}

# Project the data when explicitly requested, or when an existing model was
# supplied (in which case projection is the only work this script performs).
if ((projectData == 1) | reuseModel) {
  # Construct the new data set by treating the dominant eigenvectors as basis vectors.
  newA = A %*% evec_dominant;
  write(newA, output+"/projected.data", format=ofmt);
}