| #------------------------------------------------------------- |
| # |
| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| # |
| #------------------------------------------------------------- |
| |
# This script performs Probabilistic Principal Component Analysis (PPCA) on the given input data.
# It is based on the paper: "sPCA: Scalable Principal Component Analysis for Big Data on Distributed
# Platforms", Tarek Elgamal et al.
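#
# Model sketch (standard probabilistic PCA): each mean-centered row x of X is modeled as
# x = z %*% t(C) + noise, where z is a k-dimensional latent variable and the noise is
# isotropic Gaussian with variance ss (sigma^2). The script estimates C and ss with an
# EM-style loop: the expectation step computes the expected latent variables Z under the
# current C and ss, and the maximization step re-estimates C and ss from those expectations.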
| |
| # INPUT PARAMETERS: |
| # --------------------------------------------------------------------------------------------- |
| # NAME TYPE DEFAULT MEANING |
| # --------------------------------------------------------------------------------------------- |
# X          String  ---      location to read the input matrix X
# k          Int     ncol(X)  dimension of the new vector space constructed from the eigenvectors
# tolobj     Double  0.00001  objective function tolerance value to stop the PPCA algorithm
# tolrecerr  Double  0.02     reconstruction error tolerance value to stop the algorithm
# iter       Int     10       maximum number of iterations
# fmt        String  "text"   output format of the results, such as "text" or "csv"
# ---------------------------------------------------------------------------------------------
# OUTPUT PARAMETERS:
# ---------------------------------------------------------------------------------------------
# NAME       TYPE    DEFAULT  MEANING
# ---------------------------------------------------------------------------------------------
# C          Matrix  ---      principal components
# V          Matrix  ---      variances (eigenvalues) of the principal components
# ---------------------------------------------------------------------------------------------
# Example invocation:
# hadoop jar SystemDS.jar -f PPCA.dml -nvargs X=/INPUT_DIR/X C=/OUTPUT_DIR/C V=/OUTPUT_DIR/V
#   k=2 tolobj=0.00001 tolrecerr=0.02 iter=100
#
| |
| X = read($X); |
| |
| fileC = $C; |
| fileV = $V; |
| |
| k = ifdef($k, ncol(X)); |
| iter = ifdef($iter, 10); |
| tolobj = ifdef($tolobj, 0.00001); |
| tolrecerr = ifdef($tolrecerr, 0.02); |
| fmt0 = ifdef($fmt, "text"); |
| |
| n = nrow(X); |
| m = ncol(X); |
| |
| #initializing principal components matrix |
| C = rand(rows=m, cols=k, pdf="normal"); |
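# initializing the noise variance ss (sigma^2 of the PPCA model) with a random scalar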
| ss = rand(rows=1, cols=1, pdf="normal"); |
| ss = as.scalar(ss); |
| ssPrev = ss; |
| |
# best selected principal components - with the lowest reconstruction error
| PC = C; |
| |
# initializing reconstruction error (above the tolerance so the loop starts)
| RE = tolrecerr+1; |
| REBest = RE; |
| |
| Z = matrix(0,rows=1,cols=1); |
| |
# relative change of the objective function value (initialized above the tolerance to enter the loop)
| ObjRelChng = tolobj+1; |
| |
| # mean centered input matrix - dim -> [n,m] |
| Xm = X - colMeans(X); |
| |
| #I -> k x k |
| ITMP = matrix(1,rows=k,cols=1); |
| I = diag(ITMP); |
| |
| i = 0; |
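# EM iterations: stop after 'iter' iterations, or when the relative change of the objective
# drops below tolobj, or when the reconstruction error drops below tolrecerr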
| while (i < iter & ObjRelChng > tolobj & RE > tolrecerr){ |
#Expectation step (E-step)
| #M -> k x k |
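#M = t(C) %*% C + ss*I; ss * inv(M) is the posterior covariance of the latent variables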
| M = t(C) %*% C + I*ss; |
| |
#Auxiliary matrix holding the n expected latent variables (posterior means)
| # Z -> n x k |
| Z = Xm %*% (C %*% inv(M)); |
| |
| #ZtZ -> k x k |
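#expected second moment of the latent variables: sample cross-product t(Z) %*% Z plus the posterior covariance term ss * inv(M)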
| ZtZ = t(Z) %*% Z + inv(M)*ss; |
| |
| #XtZ -> m x k |
| XtZ = t(Xm) %*% Z; |
| |
#Maximization step (M-step)
| #C -> m x k |
ZtZ_sum = sum(ZtZ); # alternatively: sum(ZtZ + n*inv(M))
| C = XtZ/ZtZ_sum; |
| |
| #ss2 -> 1 x 1 |
| ss2 = trace(ZtZ * (t(C) %*% C)); |
| |
| #ss3 -> 1 x 1 |
| ss3 = sum((Z %*% t(C)) %*% t(Xm)); |
| |
#squared Frobenius norm of the mean-centered data
| #Fn -> 1 x 1 |
| Fn = sum(Xm*Xm); |
| |
| #ss -> 1 x 1 |
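#updated noise variance estimate, averaged over all n*m cells of the data matrix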
| ss = (Fn + ss2 - 2*ss3)/(n*m); |
| |
| #calculating objective function relative change |
| ObjRelChng = abs(1 - ss/ssPrev); |
| #print("Objective Relative Change: " + ObjRelChng + ", Objective: " + ss); |
| |
| #Reconstruction error |
| R = ((Z %*% t(C)) - Xm); |
| |
#calculate the relative error: ratio of summed residuals to the summed centered data
#TODO rethink calculation of reconstruction error ....
#RE -> 1 x 1
| RE = abs(sum(R)/sum(Xm)); |
| if (RE < REBest){ |
| PC = C; |
| REBest = RE; |
| } |
| #print("ss: " + ss +" = Fn( "+ Fn +" ) + ss2( " + ss2 +" ) - 2*ss3( " + ss3 + " ), Reconstruction Error: " + RE); |
| |
| ssPrev = ss; |
| i = i+1; |
| } |
| print("Objective Relative Change: " + ObjRelChng); |
print("Number of iterations: " + i + ", Reconstruction Err: " + REBest);
| |
# project the input data onto the selected principal components
| # RD -> n x k |
| RD = X %*% PC; |
| |
# calculate eigenvalues - principal component variance
| RDMean = colMeans(RD); |
| V = t(colMeans(RD*RD) - (RDMean*RDMean)); |
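# each entry of V is the (biased) sample variance of the data projected onto one principal component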
| |
| # sorting eigenvalues and eigenvectors in decreasing order |
| V_decr_idx = order(target=V,by=1,decreasing=TRUE,index.return=TRUE); |
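# build a k x k permutation matrix from the sort order (VF_decr[i, V_decr_idx[i]] = 1)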
| VF_decr = table(seq(1,nrow(V)),V_decr_idx); |
| V = VF_decr %*% V; |
| PC = PC %*% VF_decr; |
| |
| # writing principal components |
| write(PC, fileC, format=fmt0); |
# writing eigenvalues / principal component variances
| write(V, fileV, format=fmt0); |