blob: 2683f904e70526af5dc0df21e5d104108fb657b7 [file] [log] [blame]
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
# This script performs Probabilistic Principal Component Analysis (PCA) on the given input data.
# It is based on the paper: sPCA: Scalable Principal Component Analysis for Big Data on Distributed
# Platforms. Tarek Elgamal et al.
# INPUT PARAMETERS:
# ---------------------------------------------------------------------------------------------
# NAME TYPE DEFAULT MEANING
# ---------------------------------------------------------------------------------------------
# X Matrix --- n x m input feature matrix
# K Integer 2 indicates dimension of the new vector space constructed from eigen vectors
# maxi Integer 10 maximum number of iterations until convergence
# tolobj Double 0.00001 objective function tolerance value to stop ppca algorithm
# tolrecerr Double 0.02 reconstruction error tolerance value to stop the algorithm
# verbose Boolean TRUE verbose debug output
# ---------------------------------------------------------------------------------------------
# OUTPUT PARAMETERS:
# ---------------------------------------------------------------------------------------------
# NAME TYPE DEFAULT MEANING
# ---------------------------------------------------------------------------------------------
# Xout Matrix --- Output feature matrix with K columns
# Mout Matrix --- Output dominant eigen vectors (can be used for projections)
# Probabilistic PCA via an EM-style iteration (sPCA, Elgamal et al.):
# alternately estimates the latent representation Z and the principal
# components C until the relative objective change or the reconstruction
# error falls below its tolerance, or maxi iterations are reached.
m_ppca = function(Matrix[Double] X, Integer K=2, Integer maxi = 10,
Double tolobj = 0.00001, Double tolrecerr = 0.02, Boolean verbose = TRUE)
return(Matrix[Double] Xout, Matrix[Double] Mout)
{
n = nrow(X);
m = ncol(X);
# initializing principal components matrix (m x K) with random normal values
C = rand(rows=m, cols=K, pdf="normal");
# initializing the noise variance ss with a random scalar
# NOTE(review): pdf="normal" can produce a negative ss; a variance is
# conventionally non-negative -- confirm this initialization is intended
ss = rand(rows=1, cols=1, pdf="normal");
ss = as.scalar(ss);
ssPrev = ss;
# best selected principal components - with the lowest reconstruction error
PC = C;
# initializing reconstruction error above tolerance so the loop can run
RE = tolrecerr+1;
REBest = RE;
Z = matrix(0,rows=1,cols=1);
# objective function relative change, initialized above tolerance
ObjRelChng = tolobj+1;
# mean centered input matrix - dim -> [n,m]
Xm = X - colMeans(X);
# I -> K x K identity matrix
ITMP = matrix(1,rows=K,cols=1);
I = diag(ITMP);
i = 0;
while (i < maxi & ObjRelChng > tolobj & RE > tolrecerr){
#Estimation step - Covariance matrix
#M -> k x k
M = t(C) %*% C + I*ss;
#Auxiliary matrix holding the n latent variables
# Z -> n x k
Z = Xm %*% (C %*% inv(M));
#ZtZ -> k x k
ZtZ = t(Z) %*% Z + inv(M)*ss;
#XtZ -> m x k
XtZ = t(Xm) %*% Z;
#Maximization step
#C -> m x k
# NOTE(review): C is updated by dividing by the scalar sum of ZtZ rather
# than multiplying XtZ by inv(ZtZ) -- confirm this simplification against
# the sPCA paper
ZtZ_sum = sum(ZtZ);
C = XtZ/ZtZ_sum;
#ss2 -> 1 x 1
ss2 = trace(ZtZ * (t(C) %*% C));
#ss3 -> 1 x 1
ss3 = sum((Z %*% t(C)) %*% t(Xm));
#Frobenius norm of mean-centered data (squared) -> Euclidean norm
#Fn -> 1 x 1
Fn = sum(Xm*Xm);
# updated objective (noise variance estimate)
#ss -> 1 x 1
ss = (Fn + ss2 - 2*ss3)/(n*m);
#calculating objective function relative change
ObjRelChng = abs(1 - ss/ssPrev);
#print("Objective Relative Change: " + ObjRelChng + ", Objective: " + ss);
#Reconstruction residual of the current model
#R -> n x m
R = ((Z %*% t(C)) - Xm);
#calculate the error
#TODO rethink calculation of reconstruction error ....
#relative reconstruction error (scalar); the n x m residual is R above
#RE -> 1 x 1
RE = abs(sum(R)/sum(Xm));
# keep the components with the lowest reconstruction error seen so far
if (RE < REBest){
PC = C;
REBest = RE;
}
#print("ss: " + ss +" = Fn( "+ Fn +" ) + ss2( " + ss2 +" ) - 2*ss3( " + ss3 + " ), Reconstruction Error: " + RE);
ssPrev = ss;
i = i+1;
}
if( verbose )
print("Objective Relative Change: " + ObjRelChng);
if( verbose )
print ("Number of iterations: " + i + ", Reconstruction Err: " + REBest);
# project input onto the best principal components
# Xout -> n x K
Xout = X %*% PC;
# calculate eigenvalues - per-component variance of the projected data
RDMean = colMeans(Xout);
V = t(colMeans(Xout^2) - (RDMean^2));
# sorting eigenvalues and eigenvectors in decreasing order via a
# permutation matrix built from the sorted index vector
V_decr_idx = order(target=V,by=1,decreasing=TRUE,index.return=TRUE);
VF_decr = table(seq(1,nrow(V)),V_decr_idx);
Mout = PC %*% VF_decr; # vectors (values via VF_decr %*% V)
}