blob: dcca0bb19e9f1d678e517c9883b8ed0ef257ce1e [file] [log] [blame]
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
# This builtin defines PCA that is a technique typically used to
# reduce the number of dimensions of a matrix.
# This implementation is based on calculating eigenvectors on
# the covariance matrix of the input.
#
# An example of calling in DML:
#
# .. code-block::
#
# data = read($1)
# [data_reduced, Components] = pca(data=data, K=4, onlyComponents=TRUE)
# print(Components)
#
#
# An example in a ML pipeline containing PCA:
#
# .. code-block::
#
# X = read($1)
# [X_reduced, Components] = pca(data=X, K=4)
# Y = read($2)
# bias = l2svm(X=X, Y=Y)
# X_test = read($3)
# [y_predict_normal, Y_predict_rounded] = l2svmPredict(X=X_test, W=bias)
# write($5, Y_predict_rounded)
#
#
# INPUT:
# ------------------------------------------------------------------------------
# X Input feature matrix
# K Number of components returned
# center Indicates whether or not to center the feature matrix
# scale Indicates whether or not to scale the feature matrix
# onlyComponents Indicate if only the components should be calculated and returned
# not the application of the components on X
# ------------------------------------------------------------------------------
#
# OUTPUT:
# ---------------------------------------------------------------------------
# XReduced Output feature matrix with K columns
# Components Output dominant eigen vectors sorted by influence
# Centering The column means of the input, subtracted to construct the PCA
# ScaleFactor The scaling of the values, to make each dimension same size.
# ---------------------------------------------------------------------------
m_pca = function(Matrix[Double] X, Integer K=2,
Boolean center=TRUE, Boolean scale=TRUE,
Boolean onlyComponents=FALSE)
return (Matrix[Double] XReduced, Matrix[Double] Components,
Matrix[Double] Centering, Matrix[Double] ScaleFactor)
{
N = nrow(X);
D = ncol(X);
if(K > D) {
print("PCA: invalid parameter value")
print("K should not be greater than the number of columns in X")
print("setting K = ncol(X)")
K = D
}
# perform z-scoring (centering and scaling)
[X, Centering, ScaleFactor] = scale(X, center, scale);
mu = colSums(X)/N;
# co-variance matrix minus correction
C = (t(X) %*% X)/(N-1) - (N/(N-1))*t(mu) %*% mu;
# compute eigen vectors and values
[eigen_values, eigen_vectors] = eigen(C);
# Sort eigenvalues by decreasing order
decreasing_Idx = order(target=eigen_values, by=1, decreasing=TRUE, index.return=TRUE);
diagonal_matrix = table(seq(1,D), decreasing_Idx);
# sorts eigenvectors column-wise in the order of decreasing eigen values
sorted_eigenvectors = eigen_vectors %*% diagonal_matrix;
# Slice out the number of requested K eigenvectors
Components = sorted_eigenvectors[,1:K];
if(onlyComponents){
# If only the components are wanted, we return XReduced as empty.
XReduced = matrix(0, rows=0, cols=0)
}
else{
# Construct new data set by treating the computed eigenvectors as the basis vectors
XReduced = X %*% Components;
# Check if the output contains infinity
# This is to avoid spark pulling back the dataset for replace.
containsInf = contains(target=XReduced, pattern=1/0);
if(containsInf){
# replace infinity with zero
XReduced = replace(target=XReduced, pattern=1/0, replacement=0);
}
}
}