scripts/builtin/pca.dml - systemds - Git at Google

 #-------------------------------------------------------------
 #
 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
 #
 #-------------------------------------------------------------

 # Principal Component Analysis (PCA) for dimensionality reduction
 # ---------------------------------------------------------------------------------------------
 # NAME   TYPE    DEFAULT  MEANING
 # ---------------------------------------------------------------------------------------------
 # X      Matrix  ---      Input feature matrix
 # K      Int     2        Number of reduced dimensions (i.e., columns)
 # center Boolean TRUE     Indicates whether or not to center the feature matrix
 # scale  Boolean TRUE     Indicates whether or not to scale the feature matrix
 # ---------------------------------------------------------------------------------------------
 # Xout   Matrix  ---      Output feature matrix with K columns
 # Mout   Matrix  ---      Output dominant eigen vectors (can be used for projections)
 # ---------------------------------------------------------------------------------------------

 # Projects X onto its K dominant principal components: X is passed through scale()
 # according to center/scale, the eigen decomposition of its covariance matrix is
 # ordered by decreasing eigenvalue, and the first K eigenvectors form the new basis.
 # A K larger than ncol(X) is clamped to ncol(X).
 m_pca = function(Matrix[Double] X, Integer K=2, Boolean center=TRUE, Boolean scale=TRUE)
   return (Matrix[Double] Xout, Matrix[Double] Mout)
 {
   N = nrow(X);
   D = ncol(X);

   # at most D components exist; clamp K instead of failing on the column slice below
   if( K > D ) {
     print("PCA: K should not be greater than the number of columns in X, setting K = ncol(X)");
     K = D;
   }

   # perform z-scoring (centering and scaling)
   X = scale(X, center, scale);

   # co-variance matrix
   mu = colSums(X)/N;
   C = (t(X) %*% X)/(N-1) - (N/(N-1))*t(mu) %*% mu;

   # compute eigen vectors and values
   [evalues, evectors] = eigen(C);

   # decreasing_Idx[i] is the position of the i-th largest eigenvalue;
   # diagmat is the permutation matrix with diagmat[i, decreasing_Idx[i]] = 1
   decreasing_Idx = order(target=evalues,by=1,decreasing=TRUE,index.return=TRUE);
   diagmat = table(seq(1,D),decreasing_Idx);

   # sorts eigenvectors column-wise in the order of decreasing eigenvalues:
   # column i of (evectors %*% t(diagmat)) is evectors[,decreasing_Idx[i]].
   # Multiplying by diagmat itself would apply the inverse permutation, which is
   # only the same when the permutation happens to be its own inverse.
   evectors = evectors %*% t(diagmat);

   evec_dominant = evectors[,1:K];

   # Construct new data set by treating computed dominant eigenvectors as the basis vectors
   Xout = X %*% evec_dominant;
   Mout = evec_dominant;
 }
	#-------------------------------------------------------------
	#
	# Licensed to the Apache Software Foundation (ASF) under one
	# or more contributor license agreements. See the NOTICE file
	# distributed with this work for additional information
	# regarding copyright ownership. The ASF licenses this file
	# to you under the Apache License, Version 2.0 (the
	# "License"); you may not use this file except in compliance
	# with the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing,
	# software distributed under the License is distributed on an
	# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	# KIND, either express or implied. See the License for the
	# specific language governing permissions and limitations
	# under the License.
	#
	#-------------------------------------------------------------

	# Principal Component Analysis (PCA) for dimensionality reduction
	# ---------------------------------------------------------------------------------------------
	# NAME TYPE DEFAULT MEANING
	# ---------------------------------------------------------------------------------------------
	# X Matrix --- Input feature matrix
	# K Int 2 Number of reduced dimensions (i.e., columns)
	# center Boolean TRUE Indicates whether or not to center the feature matrix
	# scale Boolean TRUE Indicates whether or not to scale the feature matrix
	# ---------------------------------------------------------------------------------------------
	# Xout Matrix --- Output feature matrix with K columns
	# Mout Matrix --- Output dominant eigen vectors (can be used for projections)
	# ---------------------------------------------------------------------------------------------

	# Projects X onto its K dominant principal components: X is passed through scale()
	# according to center/scale, the eigen decomposition of its covariance matrix is
	# ordered by decreasing eigenvalue, and the first K eigenvectors form the new basis.
	# A K larger than ncol(X) is clamped to ncol(X).
	m_pca = function(Matrix[Double] X, Integer K=2, Boolean center=TRUE, Boolean scale=TRUE)
	  return (Matrix[Double] Xout, Matrix[Double] Mout)
	{
	  N = nrow(X);
	  D = ncol(X);

	  # at most D components exist; clamp K instead of failing on the column slice below
	  if( K > D ) {
	    print("PCA: K should not be greater than the number of columns in X, setting K = ncol(X)");
	    K = D;
	  }

	  # perform z-scoring (centering and scaling)
	  X = scale(X, center, scale);

	  # co-variance matrix: matrix products (%*%) throughout, not the modulus operator (%%),
	  # and an explicit scalar multiplication in front of t(mu)
	  mu = colSums(X)/N;
	  C = (t(X) %*% X)/(N-1) - (N/(N-1))*t(mu) %*% mu;

	  # compute eigen vectors and values
	  [evalues, evectors] = eigen(C);

	  # decreasing_Idx[i] is the position of the i-th largest eigenvalue;
	  # diagmat is the permutation matrix with diagmat[i, decreasing_Idx[i]] = 1
	  decreasing_Idx = order(target=evalues,by=1,decreasing=TRUE,index.return=TRUE);
	  diagmat = table(seq(1,D),decreasing_Idx);

	  # sorts eigenvectors column-wise in the order of decreasing eigenvalues:
	  # column i of (evectors %*% t(diagmat)) is evectors[,decreasing_Idx[i]].
	  # Multiplying by diagmat itself would apply the inverse permutation, which is
	  # only the same when the permutation happens to be its own inverse.
	  evectors = evectors %*% t(diagmat);

	  evec_dominant = evectors[,1:K];

	  # Construct new data set by treating computed dominant eigenvectors as the basis vectors
	  Xout = X %*% evec_dominant;
	  Mout = evec_dominant;
	}