| #------------------------------------------------------------- |
| # |
| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| # |
| #------------------------------------------------------------- |
| |
| # |
| # This script performs Principal Component Analysis (PCA) on the given input data. |
| # |
| # INPUT PARAMETERS: |
| # --------------------------------------------------------------------------------------------- |
| # NAME TYPE DEFAULT MEANING |
| # --------------------------------------------------------------------------------------------- |
| # INPUT String --- Location to read the matrix A of feature vectors |
| # K Int ncol(A) Number of dominant eigenvectors to keep, i.e. the dimension of the new vector space |
| # CENTER Int 0 Indicates whether or not to center data |
| # SCALE Int 0 Indicates whether or not to scale data |
| # OFMT String CSV Output data format |
| # PROJDATA Int 0 This argument indicates if the data should be projected or not |
| # MODEL String --- Location to already existing model: eigenvectors and eigenvalues |
| # OUTPUT String / Location to write output matrices (dominant eigenvalues, their square roots, |
| # the new basis vectors, and the data projected onto the new basis vectors) |
| # Example: |
| # hadoop jar SystemDS.jar -f PCA.dml -nvargs INPUT=INPUT_DIR/pca-1000x1000 |
| # OUTPUT=OUTPUT_DIR/pca-1000x1000-model PROJDATA=1 CENTER=1 SCALE=1 |
| # --------------------------------------------------------------------------------------------- |
| |
| # ---- Read the input matrix and resolve the optional named arguments ---- |
| A = read($INPUT); |
| |
| # Location of an existing model; the empty string means none was supplied. |
| model = ifdef($MODEL, ""); |
| # Where results are written, and in which format. |
| output = ifdef($OUTPUT, "/"); |
| ofmt = ifdef($OFMT, "CSV"); |
| |
| # Number of dominant eigenvectors to keep; defaults to the column count of A. |
| K = ifdef($K, ncol(A)); |
| # Integer switches, compared against 1 further down: centering, scaling, projection. |
| center = ifdef($CENTER, 0); |
| scale = ifdef($SCALE, 0); |
| projectData = ifdef($PROJDATA, 0); |
| |
| # Placeholder so that evec_dominant is defined on every path; both branches below overwrite it. |
| evec_dominant = matrix(0, rows=1, cols=1); |
| |
| if (model != "") { |
|   # Reuse an existing model: only its dominant eigenvectors are loaded. |
|   # NOTE(review): on this path A is not centered/scaled even if CENTER/SCALE are set - confirm this is intended. |
|   evec_dominant = read(model + "/dominant.eigen.vectors"); |
| } else { |
|   # No model was supplied, so the newly computed model is written to the output location. |
|   model = output; |
| |
|   N = nrow(A); |
|   D = ncol(A); |
| |
|   # Fail early with a clear message instead of failing later when slicing evalues[1:K, 1] / evectors[, 1:K]. |
|   if (K < 1 | K > D) { |
|     stop("PCA: K must be between 1 and the number of input columns (" + D + "), but was " + K); |
|   } |
|   # The covariance below divides by (N-1). |
|   if (N < 2) { |
|     stop("PCA: at least 2 input rows are required to compute the covariance matrix, but got " + N); |
|   } |
| |
|   # perform z-scoring (centering and scaling) according to the CENTER / SCALE switches |
|   A = scale(A, center==1, scale==1); |
| |
|   # co-variance matrix, computed from the column means mu |
|   mu = colSums(A)/N; |
|   C = (t(A) %*% A)/(N-1) - (N/(N-1))*t(mu) %*% mu; |
| |
|   # compute eigen vectors and values |
|   [evalues, evectors] = eigen(C); |
| |
|   # decreasing_Idx[i] is the row of evalues holding the i-th largest eigenvalue; |
|   # diagmat is the matching permutation matrix with diagmat[i, decreasing_Idx[i]] = 1. |
|   decreasing_Idx = order(target=evalues,by=1,decreasing=TRUE,index.return=TRUE); |
|   diagmat = table(seq(1,D),decreasing_Idx); |
|   # sorts eigenvalues by decreasing order: row i becomes evalues[decreasing_Idx[i]] |
|   evalues = diagmat %*% evalues; |
|   # sorts eigenvectors column-wise in the same order: column i becomes evectors[, decreasing_Idx[i]]. |
|   # This needs the TRANSPOSED permutation; multiplying by diagmat itself applies the inverse |
|   # permutation, which only coincides when the permutation is its own inverse (e.g. a plain reversal). |
|   evectors = evectors %*% t(diagmat); |
| |
|   # select K dominant eigen values and vectors |
|   eval_dominant = evalues[1:K, 1]; |
|   evec_dominant = evectors[,1:K]; |
| |
|   # the square root of eigenvalues |
|   eval_stdev_dominant = sqrt(eval_dominant); |
| |
|   # persist the model |
|   write(eval_stdev_dominant, model+"/dominant.eigen.standard.deviations", format=ofmt); |
|   write(eval_dominant, model+"/dominant.eigen.values", format=ofmt); |
|   write(evec_dominant, model+"/dominant.eigen.vectors", format=ofmt); |
| } |
| # Project the data only when explicitly requested (PROJDATA=1) or when an existing model was supplied. |
| # The raw MODEL argument is re-read here because `model` is reassigned to the output location |
| # when a new model is computed, so testing `model != ""` would hold for any non-empty OUTPUT |
| # and PROJDATA=0 would be ignored. |
| suppliedModel = ifdef($MODEL, ""); |
| if (projectData == 1 | suppliedModel != "") { |
|   # Construct new data set by treating computed dominant eigenvectors as the basis vectors |
|   newA = A %*% evec_dominant; |
|   write(newA, output+"/projected.data", format=ofmt); |
| } |