blob: 849f6d5b908b57d8a18daa344c01831895090a30 [file] [log] [blame]
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# Computes the parameters needed for Gaussian Classification.
# For each class it computes the prior probability, the mean per feature,
# the inverse covariance matrix and the determinant of the covariance
# matrix. A small smoothing value (the largest class variance scaled by
# varSmoothing, which defaults to 1e-9) is added to the variances on the
# diagonal of each covariance matrix to prevent numerical errors / instabilities.
# INPUT:
# ----------------------------------------------------------------------------------
# D             Input matrix (training set)
# C             Target vector
# varSmoothing  Smoothing factor for variances
# verbose       Print accuracy of the training set
# ----------------------------------------------------------------------------------
# OUTPUT:
# ------------------------------------------------------------------------------------------
# classPriors          Vector storing the class prior probabilities
# classMeans           Matrix storing the means of the classes
# classInvCovariances  List of inverse covariance matrices
# determinants         Vector storing the determinants of the classes
# ------------------------------------------------------------------------------------------
m_gaussianClassifier = function(Matrix[Double] D, Matrix[Double] C, Double varSmoothing=1e-9, Boolean verbose = TRUE)
  return (Matrix[Double] classPriors, Matrix[Double] classMeans,
  List[Unknown] classInvCovariances, Matrix[Double] determinants)
{
  # Fits one multivariate Gaussian per class.
  #
  # D            training samples, one row per sample
  # C            class label per row of D; labels are used as group ids
  #              1..max(C) by aggregate() below
  # varSmoothing factor applied to the largest class variance to obtain the
  #              value added to every covariance diagonal
  # verbose      if TRUE, classify the training set and print the accuracy
  #
  # Returns the class priors, the per-class feature means, a list holding one
  # inverse covariance matrix per class (in class order), and the determinant
  # of each (smoothed) covariance matrix.

  #Retrieve number of samples, classes and features
  nSamples = nrow(D)
  nClasses = max(C)
  nFeats = ncol(D)

  #Compute means, variances and priors
  classCounts = aggregate(target=C, groups=C, fn="count", ngroups=as.integer(nClasses));
  classMeans = aggregate(target=D, groups=C, fn="mean", ngroups=as.integer(nClasses));
  classVars = aggregate(target=D, groups=C, fn="variance", ngroups=as.integer(nClasses));
  classPriors = classCounts / nSamples

  # Diagonal matrix with the same smoothing value for every feature.
  smoothedVar = diag(matrix(1.0, rows=nFeats, cols=1)) * max(classVars) * varSmoothing

  classInvCovariances = list()
  determinants = matrix(0, rows=nClasses, cols=1)

  #Compute determinants and inverseCovariances
  for (class in 1:nClasses)
  {
    covMatrix = matrix(0, rows=nFeats, cols=nFeats)
    classMatrix = removeEmpty(target=D, margin="rows", select=(C==class))

    # TODO replace with implementation of new built-in for var-cov matrix
    # possible vectorized implementation but results are varying in some digits
    # difference = classMatrix - classMeans[class,]
    # cov_S = 1/nrow(classMatrix) * (t(difference) %*% difference)
    for (i in 1:nFeats)
    {
      for (j in 1:nFeats)
      {
        if (j == i)
        {
          # Diagonal: reuse the variance already computed by aggregate().
          covMatrix[i,j] = classVars[class, j]
        }
        else if (j < i)
        {
          # Lower triangle: mirror the entry filled in while processing row j.
          covMatrix[i,j] = covMatrix[j,i]
        }
        else
        {
          # Upper triangle: the only entries that need an actual cov() call.
          covMatrix[i,j] = cov(classMatrix[,i], classMatrix[,j])
        }
      }
    }

    #Apply smoothing of the variances, to avoid numerical errors
    covMatrix = covMatrix + smoothedVar

    #Compute inverse via the eigen decomposition: V * diag(1/lambda) * V^T
    [eVals, eVecs] = eigen(covMatrix)
    lam = diag(eVals^(-1))
    invCovMatrix = eVecs %*% lam %*% t(eVecs)

    #Compute determinant as the product of the eigenvalues
    determinants[class, 1] = prod(eVals)
    classInvCovariances = append(classInvCovariances, invCovMatrix)
  }

  #Compute accuracy on the training set
  if (verbose)
  {
    results = matrix(0, rows=nSamples, cols=nClasses)
    parfor (class in 1:nClasses)
    {
      # Per-class constants, computed once instead of once per sample.
      invCov = as.matrix(classInvCovariances[class])
      logNorm = -0.5 * log((2*pi)^nFeats * as.scalar(determinants[class, 1]))
      logPrior = log(as.scalar(classPriors[class, 1]))

      # Row-wise quadratic form diff_i * invCov * t(diff_i) for all samples at once.
      meanDiff = D - classMeans[class,]
      quadForm = rowSums((meanDiff %*% invCov) * meanDiff)

      # Log prior plus Gaussian log-density of every sample under this class.
      results[, class] = logPrior + logNorm - 0.5 * quadForm
    }
    acc = sum(rowIndexMax(results) == C) / nSamples * 100
    print("Training Accuracy (%): " + acc)
  }
}