| #------------------------------------------------------------- |
| # |
| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| # |
| #------------------------------------------------------------- |
| |
# This script performs Probabilistic Principal Component Analysis (PPCA) on the given input data.
# It is based on the paper: "sPCA: Scalable Principal Component Analysis for Big Data on Distributed
# Platforms", Tarek Elgamal et al.
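#
# Model sketch (standard probabilistic PCA): each mean-centered row x of X is modeled as
# x = z %*% t(C) + noise, where z is a k-dimensional latent variable and the noise is
# isotropic Gaussian with variance ss (sigma^2). The script estimates C and ss with an
# EM-style loop: the expectation step computes the expected latent variables Z under the
# current C and ss, and the maximization step re-estimates C and ss from those expectations.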
| |
| # INPUT PARAMETERS: |
| # --------------------------------------------------------------------------------------------- |
| # NAME TYPE DEFAULT MEANING |
| # --------------------------------------------------------------------------------------------- |
# X          String  ---      location to read the input matrix X
# k          Int     ncol(X)  dimension of the new vector space constructed from the eigenvectors
# tolobj     Double  0.00001  objective function tolerance value to stop the PPCA algorithm
# tolrecerr  Double  0.02     reconstruction error tolerance value to stop the algorithm
# iter       Int     10       maximum number of iterations
# fmt        String  "text"   output format of the results, such as "text" or "csv"
# ---------------------------------------------------------------------------------------------
# OUTPUT PARAMETERS:
# ---------------------------------------------------------------------------------------------
# NAME       TYPE    DEFAULT  MEANING
# ---------------------------------------------------------------------------------------------
# C          Matrix  ---      principal components
# V          Matrix  ---      variances (eigenvalues) of the principal components
# ---------------------------------------------------------------------------------------------
# Example invocation:
# hadoop jar SystemDS.jar -f PPCA.dml -nvargs X=/INPUT_DIR/X C=/OUTPUT_DIR/C V=/OUTPUT_DIR/V
#   k=2 tolobj=0.00001 tolrecerr=0.02 iter=100
#
| |
| X = read($X); |
| |
| fileC = $C; |
| fileV = $V; |
| |
| k = ifdef($k, ncol(X)); |
| iter = ifdef($iter, 10); |
| tolobj = ifdef($tolobj, 0.00001); |
| tolrecerr = ifdef($tolrecerr, 0.02); |
| fmt0 = ifdef($fmt, "text"); |
| |
| n = nrow(X); |
| m = ncol(X); |
| |
| #initializing principal components matrix |
| C = rand(rows=m, cols=k, pdf="normal"); |
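# initializing the noise variance ss (sigma^2 of the PPCA model) with a random scalar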
| ss = rand(rows=1, cols=1, pdf="normal"); |
| ss = as.scalar(ss); |
| ssPrev = ss; |
| |
# best selected principal components - with the lowest reconstruction error
| PC = C; |
| |
# initializing reconstruction error (above the tolerance so the loop starts)
| RE = tolrecerr+1; |
| REBest = RE; |
| |
| Z = matrix(0,rows=1,cols=1); |
| |
# relative change of the objective function value (initialized above the tolerance to enter the loop)
| ObjRelChng = tolobj+1; |
| |
| # mean centered input matrix - dim -> [n,m] |
| Xm = X - colMeans(X); |
| |
| #I -> k x k |
| ITMP = matrix(1,rows=k,cols=1); |
| I = diag(ITMP); |
| |
| i = 0; |
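# EM iterations: stop after 'iter' iterations, or when the relative change of the objective
# drops below tolobj, or when the reconstruction error drops below tolrecerr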
| while (i < iter & ObjRelChng > tolobj & RE > tolrecerr){ |
#Expectation step (E-step)
| #M -> k x k |
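#M = t(C) %*% C + ss*I; ss * inv(M) is the posterior covariance of the latent variables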
| M = t(C) %*% C + I*ss; |
| |
#Auxiliary matrix holding the n expected latent variables (posterior means)
| # Z -> n x k |
| Z = Xm %*% (C %*% inv(M)); |
| |
| #ZtZ -> k x k |
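#expected second moment of the latent variables: sample cross-product t(Z) %*% Z plus the posterior covariance term ss * inv(M)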
| ZtZ = t(Z) %*% Z + inv(M)*ss; |
| |
| #XtZ -> m x k |
| XtZ = t(Xm) %*% Z; |
| |
#Maximization step (M-step)
| #C -> m x k |
ZtZ_sum = sum(ZtZ); # alternatively: sum(ZtZ + n*inv(M))
| C = XtZ/ZtZ_sum; |
| |
| #ss2 -> 1 x 1 |
| ss2 = trace(ZtZ * (t(C) %*% C)); |
| |
| #ss3 -> 1 x 1 |
| ss3 = sum((Z %*% t(C)) %*% t(Xm)); |
| |
#squared Frobenius norm of the mean-centered data
| #Fn -> 1 x 1 |
| Fn = sum(Xm*Xm); |
| |
| #ss -> 1 x 1 |
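#updated noise variance estimate, averaged over all n*m cells of the data matrix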
| ss = (Fn + ss2 - 2*ss3)/(n*m); |
| |
| #calculating objective function relative change |
| ObjRelChng = abs(1 - ss/ssPrev); |
| #print("Objective Relative Change: " + ObjRelChng + ", Objective: " + ss); |
| |
| #Reconstruction error |
| R = ((Z %*% t(C)) - Xm); |
| |
#calculate the relative error: ratio of summed residuals to the summed centered data
#TODO rethink calculation of reconstruction error ....
#RE -> 1 x 1
| RE = abs(sum(R)/sum(Xm)); |
| if (RE < REBest){ |
| PC = C; |
| REBest = RE; |
| } |
| #print("ss: " + ss +" = Fn( "+ Fn +" ) + ss2( " + ss2 +" ) - 2*ss3( " + ss3 + " ), Reconstruction Error: " + RE); |
| |
| ssPrev = ss; |
| i = i+1; |
| } |
| print("Objective Relative Change: " + ObjRelChng); |
print("Number of iterations: " + i + ", Reconstruction Err: " + REBest);
| |
# project the input data onto the selected principal components
| # RD -> n x k |
| RD = X %*% PC; |
| |
# calculate eigenvalues - principal component variance
| RDMean = colMeans(RD); |
| V = t(colMeans(RD*RD) - (RDMean*RDMean)); |
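# each entry of V is the (biased) sample variance of the data projected onto one principal component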
| |
| # sorting eigenvalues and eigenvectors in decreasing order |
| V_decr_idx = order(target=V,by=1,decreasing=TRUE,index.return=TRUE); |
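# build a k x k permutation matrix from the sort order (VF_decr[i, V_decr_idx[i]] = 1)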
| VF_decr = table(seq(1,nrow(V)),V_decr_idx); |
| V = VF_decr %*% V; |
| PC = PC %*% VF_decr; |
| |
| # writing principal components |
| write(PC, fileC, format=fmt0); |
# writing eigenvalues / principal component variances
| write(V, fileV, format=fmt0); |