blob: abfcdb1431f04b57898984a5d800b858a93352a2 [file] [log] [blame]
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
# This script performs Probabilistic Principal Component Analysis (PCA) on the given input data.
# It is based on paper: sPCA: Scalable Principal Component Analysis for Big Data on Distributed
# Platforms. Tarek Elgamal et.al.
# INPUT PARAMETERS:
# ---------------------------------------------------------------------------------------------
# NAME TYPE DEFAULT MEANING
# ---------------------------------------------------------------------------------------------
# X String --- location to read the matrix X input matrix
# k Int --- indicates dimension of the new vector space constructed from eigen vectors
# tolobj Int 0.00001 objective function tolerance value to stop ppca algorithm
# tolrecerr Int 0.02 reconstruction error tolerance value to stop the algorithm
# iter Int 10 maximum number of iterations
# fmt String 'text' output format of results PPCA such as "text" or "csv"
# hadoop jar SystemML.jar -f PPCA.dml -nvargs X=/INPUT_DIR/X C=/OUTPUT_DIR/C V=/OUTPUT_DIR/V k=2 tol=0.2 iter=100
# ---------------------------------------------------------------------------------------------
# OUTPUT PARAMETERS:
# ---------------------------------------------------------------------------------------------
# NAME TYPE DEFAULT MEANING
# ---------------------------------------------------------------------------------------------
# C Matrix --- principal components
# V Matrix --- eigenvalues / eigenvalues of principal components
#
X = read($X);
fileC = $C;
fileV = $V;
k = ifdef($k, ncol(X));
iter = ifdef($iter, 10);
tolobj = ifdef($tolobj, 0.00001);
tolrecerr = ifdef($tolrecerr, 0.02);
fmt0 = ifdef($fmt, "text");
n = nrow(X);
m = ncol(X);
#initializing principal components matrix
C = rand(rows=m, cols=k, pdf="normal");
ss = rand(rows=1, cols=1, pdf="normal");
ss = as.scalar(ss);
ssPrev = ss;
# best selected principle components - with the lowest reconstruction error
PC = C;
# initilizing reconstruction error
RE = tolrecerr+1;
REBest = RE;
Z = matrix(0,rows=1,cols=1);
#Objective function value
ObjRelChng = tolobj+1;
# mean centered input matrix - dim -> [n,m]
Xm = X - colMeans(X);
#I -> k x k
ITMP = matrix(1,rows=k,cols=1);
I = diag(ITMP);
i = 0;
while (i < iter & ObjRelChng > tolobj & RE > tolrecerr){
#Estimation step - Covariance matrix
#M -> k x k
M = t(C) %*% C + I*ss;
#Auxilary matrix with n latent variables
# Z -> n x k
Z = Xm %*% (C %*% inv(M));
#ZtZ -> k x k
ZtZ = t(Z) %*% Z + inv(M)*ss;
#XtZ -> m x k
XtZ = t(Xm) %*% Z;
#Maximization step
#C -> m x k
ZtZ_sum = sum(ZtZ); #+n*inv(M));
C = XtZ/ZtZ_sum;
#ss2 -> 1 x 1
ss2 = trace(ZtZ * (t(C) %*% C));
#ss3 -> 1 x 1
ss3 = sum((Z %*% t(C)) %*% t(Xm));
#Frobenius norm of reconstruction error -> Euclidean norm
#Fn -> 1 x 1
Fn = sum(Xm*Xm);
#ss -> 1 x 1
ss = (Fn + ss2 - 2*ss3)/(n*m);
#calculating objective function relative change
ObjRelChng = abs(1 - ss/ssPrev);
#print("Objective Relative Change: " + ObjRelChng + ", Objective: " + ss);
#Reconstruction error
R = ((Z %*% t(C)) - Xm);
#calculate the error
#TODO rethink calculation of reconstruction error ....
#1-Norm of reconstruction error - a big dense matrix
#RE -> n x m
RE = abs(sum(R)/sum(Xm));
if (RE < REBest){
PC = C;
REBest = RE;
}
#print("ss: " + ss +" = Fn( "+ Fn +" ) + ss2( " + ss2 +" ) - 2*ss3( " + ss3 + " ), Reconstruction Error: " + RE);
ssPrev = ss;
i = i+1;
}
print("Objective Relative Change: " + ObjRelChng);
print ("Number of iterations: " + i + ", Reconstruction Err: " + REBest);
# reconstructs data
# RD -> n x k
RD = X %*% PC;
# calculate eigenvalues - principle component variance
RDMean = colMeans(RD);
V = t(colMeans(RD*RD) - (RDMean*RDMean));
# sorting eigenvalues and eigenvectors in decreasing order
V_decr_idx = order(target=V,by=1,decreasing=TRUE,index.return=TRUE);
VF_decr = table(seq(1,nrow(V)),V_decr_idx);
V = VF_decr %*% V;
PC = PC %*% VF_decr;
# writing principal components
write(PC, fileC, format=fmt0);
# writing eigen values/pc variance
write(V, fileV, format=fmt0);