blob: 2683f904e70526af5dc0df21e5d104108fb657b7 [file] [log] [blame]
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
# This script performs Probabilistic Principal Component Analysis (PCA) on the given input data.
# It is based on the paper: sPCA: Scalable Principal Component Analysis for Big Data on Distributed
# Platforms. Tarek Elgamal et al.
# INPUT PARAMETERS:
# ---------------------------------------------------------------------------------------------
# NAME TYPE DEFAULT MEANING
# ---------------------------------------------------------------------------------------------
# X Matrix --- n x m input feature matrix
# K Integer 2 indicates dimension of the new vector space constructed from eigen vectors
# maxi Integer 10 maximum number of iterations until convergence
# tolobj Double 0.00001 objective function tolerance value to stop ppca algorithm
# tolrecerr Double 0.02 reconstruction error tolerance value to stop the algorithm
# verbose Boolean TRUE verbose debug output
# ---------------------------------------------------------------------------------------------
# OUTPUT PARAMETERS:
# ---------------------------------------------------------------------------------------------
# NAME TYPE DEFAULT MEANING
# ---------------------------------------------------------------------------------------------
# Xout Matrix --- Output feature matrix with K columns
# Mout Matrix --- Output dominant eigen vectors (can be used for projections)
# Probabilistic PCA via an EM-style iteration (sPCA, Elgamal et al.):
# alternately estimates the latent representation Z and the principal
# components C until the relative objective change or the reconstruction
# error falls below its tolerance, or maxi iterations are reached.
m_ppca = function(Matrix[Double] X, Integer K=2, Integer maxi = 10,
Double tolobj = 0.00001, Double tolrecerr = 0.02, Boolean verbose = TRUE)
return(Matrix[Double] Xout, Matrix[Double] Mout)
{
n = nrow(X);
m = ncol(X);
# initializing principal components matrix (m x K) with random normal values
C = rand(rows=m, cols=K, pdf="normal");
# initializing the noise variance ss with a random scalar
# NOTE(review): pdf="normal" can produce a negative ss; a variance is
# conventionally non-negative -- confirm this initialization is intended
ss = rand(rows=1, cols=1, pdf="normal");
ss = as.scalar(ss);
ssPrev = ss;
# best selected principal components - with the lowest reconstruction error
PC = C;
# initializing reconstruction error above tolerance so the loop can run
RE = tolrecerr+1;
REBest = RE;
Z = matrix(0,rows=1,cols=1);
# objective function relative change, initialized above tolerance
ObjRelChng = tolobj+1;
# mean centered input matrix - dim -> [n,m]
Xm = X - colMeans(X);
# I -> K x K identity matrix
ITMP = matrix(1,rows=K,cols=1);
I = diag(ITMP);
i = 0;
while (i < maxi & ObjRelChng > tolobj & RE > tolrecerr){
#Estimation step - Covariance matrix
#M -> k x k
M = t(C) %*% C + I*ss;
#Auxiliary matrix holding the n latent variables
# Z -> n x k
Z = Xm %*% (C %*% inv(M));
#ZtZ -> k x k
ZtZ = t(Z) %*% Z + inv(M)*ss;
#XtZ -> m x k
XtZ = t(Xm) %*% Z;
#Maximization step
#C -> m x k
# NOTE(review): C is updated by dividing by the scalar sum of ZtZ rather
# than multiplying XtZ by inv(ZtZ) -- confirm this simplification against
# the sPCA paper
ZtZ_sum = sum(ZtZ);
C = XtZ/ZtZ_sum;
#ss2 -> 1 x 1
ss2 = trace(ZtZ * (t(C) %*% C));
#ss3 -> 1 x 1
ss3 = sum((Z %*% t(C)) %*% t(Xm));
#Frobenius norm of mean-centered data (squared) -> Euclidean norm
#Fn -> 1 x 1
Fn = sum(Xm*Xm);
# updated objective (noise variance estimate)
#ss -> 1 x 1
ss = (Fn + ss2 - 2*ss3)/(n*m);
#calculating objective function relative change
ObjRelChng = abs(1 - ss/ssPrev);
#print("Objective Relative Change: " + ObjRelChng + ", Objective: " + ss);
#Reconstruction residual of the current model
#R -> n x m
R = ((Z %*% t(C)) - Xm);
#calculate the error
#TODO rethink calculation of reconstruction error ....
#relative reconstruction error (scalar); the n x m residual is R above
#RE -> 1 x 1
RE = abs(sum(R)/sum(Xm));
# keep the components with the lowest reconstruction error seen so far
if (RE < REBest){
PC = C;
REBest = RE;
}
#print("ss: " + ss +" = Fn( "+ Fn +" ) + ss2( " + ss2 +" ) - 2*ss3( " + ss3 + " ), Reconstruction Error: " + RE);
ssPrev = ss;
i = i+1;
}
if( verbose )
print("Objective Relative Change: " + ObjRelChng);
if( verbose )
print ("Number of iterations: " + i + ", Reconstruction Err: " + REBest);
# project input onto the best principal components
# Xout -> n x K
Xout = X %*% PC;
# calculate eigenvalues - per-component variance of the projected data
RDMean = colMeans(Xout);
V = t(colMeans(Xout^2) - (RDMean^2));
# sorting eigenvalues and eigenvectors in decreasing order via a
# permutation matrix built from the sorted index vector
V_decr_idx = order(target=V,by=1,decreasing=TRUE,index.return=TRUE);
VF_decr = table(seq(1,nrow(V)),V_decr_idx);
Mout = PC %*% VF_decr; # vectors (values via VF_decr %*% V)
}