| #------------------------------------------------------------- |
| # |
| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| # |
| #------------------------------------------------------------- |
| |
| # This script performs Probabilistic Principal Component Analysis (PCA) on the given input data. |
| # It is based on paper: sPCA: Scalable Principal Component Analysis for Big Data on Distributed |
| # Platforms. Tarek Elgamal et.al. |
| |
| # INPUT PARAMETERS: |
| # --------------------------------------------------------------------------------------------- |
| # NAME TYPE DEFAULT MEANING |
| # --------------------------------------------------------------------------------------------- |
| # X Matrix --- n x m input feature matrix |
| # k Integer --- indicates dimension of the new vector space constructed from eigen vectors |
| # maxi Integer --- maximum number of iterations until convergence |
| # tolobj Double 0.00001 objective function tolerance value to stop ppca algorithm |
| # tolrecerr Double 0.02 reconstruction error tolerance value to stop the algorithm |
| # verbose Boolen TRUE verbose debug output |
| # --------------------------------------------------------------------------------------------- |
| # OUTPUT PARAMETERS: |
| # --------------------------------------------------------------------------------------------- |
| # NAME TYPE DEFAULT MEANING |
| # --------------------------------------------------------------------------------------------- |
| # Xout Matrix --- Output feature matrix with K columns |
| # Mout Matrix --- Output dominant eigen vectors (can be used for projections) |
| |
| |
| m_ppca = function(Matrix[Double] X, Integer K=2, Integer maxi = 10, |
| Double tolobj = 0.00001, Double tolrecerr = 0.02, Boolean verbose = TRUE) |
| return(Matrix[Double] Xout, Matrix[Double] Mout) |
| { |
| n = nrow(X); |
| m = ncol(X); |
| |
| #initializing principal components matrix |
| C = rand(rows=m, cols=K, pdf="normal"); |
| ss = rand(rows=1, cols=1, pdf="normal"); |
| ss = as.scalar(ss); |
| ssPrev = ss; |
| |
| # best selected principle components - with the lowest reconstruction error |
| PC = C; |
| |
| # initilizing reconstruction error |
| RE = tolrecerr+1; |
| REBest = RE; |
| |
| Z = matrix(0,rows=1,cols=1); |
| |
| #Objective function value |
| ObjRelChng = tolobj+1; |
| |
| # mean centered input matrix - dim -> [n,m] |
| Xm = X - colMeans(X); |
| |
| #I -> k x k |
| ITMP = matrix(1,rows=K,cols=1); |
| I = diag(ITMP); |
| |
| i = 0; |
| while (i < maxi & ObjRelChng > tolobj & RE > tolrecerr){ |
| #Estimation step - Covariance matrix |
| #M -> k x k |
| M = t(C) %*% C + I*ss; |
| |
| #Auxilary matrix with n latent variables |
| # Z -> n x k |
| Z = Xm %*% (C %*% inv(M)); |
| |
| #ZtZ -> k x k |
| ZtZ = t(Z) %*% Z + inv(M)*ss; |
| |
| #XtZ -> m x k |
| XtZ = t(Xm) %*% Z; |
| |
| #Maximization step |
| #C -> m x k |
| ZtZ_sum = sum(ZtZ); #+n*inv(M)); |
| C = XtZ/ZtZ_sum; |
| |
| #ss2 -> 1 x 1 |
| ss2 = trace(ZtZ * (t(C) %*% C)); |
| |
| #ss3 -> 1 x 1 |
| ss3 = sum((Z %*% t(C)) %*% t(Xm)); |
| |
| #Frobenius norm of reconstruction error -> Euclidean norm |
| #Fn -> 1 x 1 |
| Fn = sum(Xm*Xm); |
| |
| #ss -> 1 x 1 |
| ss = (Fn + ss2 - 2*ss3)/(n*m); |
| |
| #calculating objective function relative change |
| ObjRelChng = abs(1 - ss/ssPrev); |
| #print("Objective Relative Change: " + ObjRelChng + ", Objective: " + ss); |
| |
| #Reconstruction error |
| R = ((Z %*% t(C)) - Xm); |
| |
| #calculate the error |
| #TODO rethink calculation of reconstruction error .... |
| #1-Norm of reconstruction error - a big dense matrix |
| #RE -> n x m |
| RE = abs(sum(R)/sum(Xm)); |
| if (RE < REBest){ |
| PC = C; |
| REBest = RE; |
| } |
| #print("ss: " + ss +" = Fn( "+ Fn +" ) + ss2( " + ss2 +" ) - 2*ss3( " + ss3 + " ), Reconstruction Error: " + RE); |
| |
| ssPrev = ss; |
| i = i+1; |
| } |
| if( verbose ) |
| print("Objective Relative Change: " + ObjRelChng); |
| if( verbose ) |
| print ("Number of iterations: " + i + ", Reconstruction Err: " + REBest); |
| |
| # reconstructs data |
| # RD -> n x k |
| Xout = X %*% PC; |
| |
| # calculate eigenvalues - principle component variance |
| RDMean = colMeans(Xout); |
| V = t(colMeans(Xout^2) - (RDMean^2)); |
| |
| # sorting eigenvalues and eigenvectors in decreasing order |
| V_decr_idx = order(target=V,by=1,decreasing=TRUE,index.return=TRUE); |
| VF_decr = table(seq(1,nrow(V)),V_decr_idx); |
| Mout = PC %*% VF_decr; # vectors (values via VF_decr %*% V) |
| } |