blob: a28dfd93b89e4fe4ab2fc7145fb76fab48534db1 [file] [log] [blame]
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
# ------------------------------------------
# Gaussian Mixture Model
# ------------------------------------------
# INPUT PARAMETERS:
# ---------------------------------------------------------------------------------------------
# NAME TYPE DEFAULT MEANING
# ---------------------------------------------------------------------------------------------
# X Double --- Matrix X
# n_components Integer 3 Number of n_components in the Gaussian mixture model
# model String "VVV" "VVV": unequal variance (full),each component has its own general covariance matrix
# "EEE": equal variance (tied), all components share the same general covariance matrix
# "VVI": spherical, unequal volume (diag), each component has its own diagonal
# covariance matrix
# "VII": spherical, equal volume (spherical), each component has its own single variance
# init_param String "kmeans" initialize weights with "kmeans" or "random"
# iterations Integer 100 Number of iterations
# reg_covar Double 1e-6 regularization parameter for covariance matrix
# tol Double 0.000001 tolerance value for convergence
# ---------------------------------------------------------------------------------------------
#Output(s)
# ---------------------------------------------------------------------------------------------
# NAME TYPE DEFAULT MEANING
# ---------------------------------------------------------------------------------------------
# weight Double --- A matrix whose [i,k]th entry is the probability that observation i in the test data
# belongs to the kth class
# labels Double --- Prediction matrix
# df Integer --- Number of estimated parameters
# bic Double --- Bayesian information criterion for best iteration
m_gmm = function(Matrix[Double] X, Integer n_components = 3, String model = "VVV", String init_params = "kmeans",
Integer iter = 100, Double reg_covar = 1e-6, Double tol = 0.000001, Integer seed = -1, Boolean verbose = FALSE )
return (Matrix[Double] labels, Matrix[Double] predict_prob, Integer df, Double bic,
Matrix[Double] mu, Matrix[Double] prec_chol, Matrix[Double] weight)
{
# sanity checks
if(model != "VVV" & model != "EEE" & model != "VVI" & model != "VII")
stop("model not supported, should be in VVV, EEE, VVI, VII");
[labels, predict_prob, norm, mu, prec_chol, weight] = fit(X, n_components,
model, init_params, iter, reg_covar, tol, seed, verbose)
df = estimate_free_param(n_components, ncol(X), model)
bic = getBIC(nrow(X), norm, df)
}
fit = function(Matrix[Double] X, Integer n_components, String model, String init_params,
Integer iter, Double reg_covar, Double tol, Integer seed, Boolean verbose)
return (Matrix[Double] label, Matrix[Double] predict_prob, Double log_prob_norm,
Matrix[Double] mean, Matrix[Double] precision_chol, Matrix[Double] weight)
{
et = FALSE
lower_bound = 0
converged = FALSE
[weight, mean, sigma, precision_chol] = initialize_param(X, n_components,init_params, model, reg_covar, tol, seed)
i = 1
while(i <= iter & !converged & !et) {
prev_lower_bound = lower_bound
[log_prob_norm, log_resp, weighted_log_prob] = e_step(X, weight, mean, precision_chol, model)
[weight, mean, sigma, precision_chol, et] = m_step(X, log_resp, n_components, model, reg_covar)
lower_bound = log_prob_norm
change = lower_bound - prev_lower_bound
converged = (abs(change) < tol)
if(verbose) {
print("executing " +i+" iteration")
print("converged " +converged)
print("diff: "+(abs(change))+" tol: "+tol)
}
i = i+1
}
if(et) {
print("warning: did not converge because some components have ill-defined empirical covariance
(i.e., singleton matrix or non-symmetric).
\nTry to decrease the number of components, or increase reg_covar")
label = rowIndexMax(weighted_log_prob)
predict_prob = exp(log_resp)
}
else {
[log_prob_norm, log_resp, weighted_log_prob] = e_step(X, weight, mean, precision_chol, model)
label = rowIndexMax(weighted_log_prob)
predict_prob = exp(log_resp)
}
}
initialize_param = function(Matrix[Double] X, Integer n_components, String init_params,
String model, Double reg_covar, Double tol , Integer seed)
return (Matrix[Double] weight, Matrix[Double] mean, Matrix[Double] sigma, Matrix[Double] precision_chol)
{
# create responsibility matrix, resp[n_samples, n_components]
resp = matrix(0, nrow(X), n_components)
if(init_params == "kmeans") {
[C, Y] = kmeans(X=X, k=n_components, runs=10,
eps=tol, is_verbose=FALSE, avg_sample_size_per_centroid=100)
resp = ((resp + t(seq(1, n_components))) == Y)
}
else if(init_params == "random") {
resp = Rand(rows = nrow(X), cols=n_components, seed=seed)
resp = resp/rowSums(resp)
}
else stop("invalid parameter value, expected kmeans or random found "+init_params)
[weight, mean, sigma] = estimate_gaussian_param(X, resp, n_components, model, reg_covar)
weight = weight/nrow(X)
[precision_chol, et] = compute_precision_cholesky(sigma, model, n_components)
if(et)
stop("Fitting the mixture model failed because some components have ill-defined empirical covariance
(i.e., singleton matrix or non-symmetric).
\nTry to decrease the number of components, or increase reg_covar")
}
estimate_gaussian_param = function(Matrix[Double] X, Matrix[Double] resp,
Integer n_components, String model, Double reg_covar)
return (Matrix[Double] weight, Matrix[Double] mean, Matrix[Double] sigma)
{
MACHINE_PRECISION = 2.22e-16
# estimate Gaussian parameter
weight = colSums(resp) + MACHINE_PRECISION # adding machine precision
mean = (t(resp) %*% X) / t(weight) # mean dims: n_components * n_features
if(model == "VVV") {
# output: (sigma a list of length = n_components where each item in list is a covariance matrix of (
# n_features * n_features) dimensions) all rbind in a matrix form
sigma = matrix(0, 0, ncol(X))
for(k in 1:nrow(mean)) {
diff = X - mean[k,]
cov = (t(diff * resp[, k]) %*% diff) / as.scalar(weight[1,k])
cov = cov + diag(matrix(reg_covar, ncol(cov), 1))
sigma = rbind(sigma, cov)
}
}
else if(model == "EEE") {
# output: (sigma a list of length = 1 where item in list is a covariance matrix of (n_features * n_features) dimensions)
# all rbind in a matrix form
avgX2 = t(X) %*% X
avgMean = (t(mean) * weight) %*% mean
cov = avgX2 - avgMean
cov = cov / sum(weight)
cov = cov + diag(matrix(reg_covar, ncol(cov), 1))
sigma = cov
}
else if(model == "VVI") {
# output: (sigma a list of length = 1 where item in list is a covariance matrix of (n_components * n_features) dimensions)
avgX2 = (t(resp) %*% (X*X)) / t(weight)
avgMean = mean ^ 2
avgMean2 = mean * (t(resp) %*% X) / t(weight)
cov = avgX2 - 2 * avgMean + avgMean2 + reg_covar
sigma = cov
}
else if (model == "VII") {
# output: (sigma a list of length = 1 where item in list is a variance value for each component (1* n_components) dimensions)
avgX2 = (t(resp) %*% (X*X)) / t(weight)
avgMean = mean ^ 2
avgMean2 = mean * (t(resp) %*% X) / t(weight)
cov = avgX2 - 2 * avgMean + avgMean2 + reg_covar
sigma = rowMeans(cov)
}
}
compute_precision_cholesky = function(Matrix[Double] sigma, String model, Integer n_components)
return (Matrix[Double] precision_chol, Boolean earlyTermination )
{
earlyTermination = FALSE
if(model == "VVV") {
index = 1; k = 1
precision_chol = matrix(0, 0, ncol(sigma))
while(k <= n_components) {
cov = sigma[index:(ncol(sigma)*k), ]
isSPD = checkSPD(cov)
if(isSPD) {
cov_chol = choleskymatrix(cov)
pre_chol = t(inv(cov_chol))
precision_chol = rbind(precision_chol, pre_chol)
index = index + ncol(sigma)
k = k+1
} else {
earlyTermination = TRUE;
k = n_components + 1
}
}
}
else if(model == "EEE") {
cov = sigma
isSPD = checkSPD(cov)
if(isSPD) {
cov_chol = cholesky(cov)
pre_chol = t(inv(cov_chol))
precision_chol = pre_chol
} else
earlyTermination = TRUE
}
else {
cov = sigma
if(sum(cov <= 0) > 0)
earlyTermination = TRUE
else {
precision_chol = 1.0/sqrt(cov)
}
}
}
# Expectation step
e_step = function(Matrix[Double] X, Matrix[Double] w, Matrix[Double] mu,
Matrix[Double] precisions_cholesky, String model)
return(Double norm, Matrix[Double] log_resp, Matrix[Double] weighted_log_prob)
{
weighted_log_prob = estimate_log_gaussian_prob(X, mu, precisions_cholesky, model) + log(w)
log_prob_norm = logSumExp(weighted_log_prob, "rows")
log_resp = weighted_log_prob - log_prob_norm
norm = mean(log_prob_norm)
}
# maximization Step
m_step = function(Matrix[Double] X, Matrix[Double] log_resp, Integer n_components, String model, Double reg_covar)
return (Matrix[Double] weight, Matrix[Double] mean, Matrix[Double] sigma, Matrix[Double] precision_chol, Boolean et)
{
[weight, mean, sigma] = estimate_gaussian_param(X, exp(log_resp), n_components, model, reg_covar)
weight = weight/nrow(X)
[precision_chol, et] = compute_precision_cholesky(sigma, model, n_components)
}
estimate_log_gaussian_prob = function(Matrix[Double] X, Matrix[Double] mu, Matrix[Double] prec_chol, String model)
return(Matrix[Double] es_log_prob ) # nrow(X) * n_components
{
n_components = nrow(mu)
log_det = compute_log_det_cholesky(prec_chol, model, ncol(X))
if(model == "VVV") {
log_prob = matrix(0, nrow(X), n_components)
i = 1
for(k in 1:n_components) {
prec = prec_chol[i:(k*ncol(X)),]
y = X %*% prec - mu[k,] %*% prec # changing here t intro: y = X %*% prec - mu[k,] %*% prec
log_prob[, k] = rowSums(y*y)
i = i + ncol(X)
}
}
else if(model == "EEE") {
log_prob = matrix(0, nrow(X), n_components)
prec = prec_chol
for(k in 1:n_components) {
y = X %*% prec - mu[k,] %*% prec
log_prob[, k] = rowSums(y*y) # TODO replace y*y with squared built-in
}
}
else if(model == "VVI") {
prec = prec_chol
precisions = prec^2
bc_matrix = matrix(1,nrow(X), nrow(mu))
log_prob = (bc_matrix*t(rowSums(mu^2 * precisions)) -
2. * (X %*% t(mu * precisions)) +
X^2 %*% t(precisions))
}
else if (model == "VII") {
prec = prec_chol
precisions = prec^ 2
bc_matrix = matrix(1,nrow(X), nrow(mu))
log_prob = (bc_matrix * t(rowSums(mu^2) * precisions) -
2 * X %*% t(mu * precisions) +
rowSums(X*X) %*% t(precisions) ) # TODO replace rowSums(X*X) with squared rowNorm() built-in
}
if(ncol(log_det) == 1)
log_det = matrix(1, 1, ncol(log_prob)) * log_det
es_log_prob = -.5 * (ncol(X) * log(2 * pi) + log_prob) + log_det
}
compute_log_det_cholesky = function(Matrix[Double] mat_chol, String model, Integer d)
return(Matrix[Double] log_det_cholesky)
{
comp = nrow(mat_chol)/ncol(mat_chol)
if(model == "VVV") {
log_det_chol = matrix(0, 1, comp)
i = 1
for(k in 1:comp) {
mat = mat_chol[i:(k*ncol(mat_chol))]
log_det = sum(log(diag(t(mat)))) # have to take the log of diag elements only
log_det_chol[1,k] = log_det
i = i + ncol(mat_chol)
}
}
else if(model == "EEE")
log_det_chol = as.matrix(sum(log(diag(mat_chol))))
else if(model == "VVI")
log_det_chol = t(rowSums(log(mat_chol)))
else if (model == "VII")
log_det_chol = t(d * log(mat_chol))
log_det_cholesky = log_det_chol
}
# compute the number of estimated parameters
estimate_free_param = function(Integer n_components, Integer n_features, String model)
return (Integer n_parameters)
{
if(model == "VVV")
cov_param = n_components * n_features * (n_features + 1) / 2
else if(model == "EEE")
cov_param = n_features * (n_features + 1) / 2
else if (model == "VVI")
cov_param = n_components * n_features
else if (model == "VII")
cov_param = n_components
else
stop("invalid model expecting any of [VVV,EEE,VVI,VII], found "+model)
mean_param = n_features * n_components
n_parameters = as.integer( cov_param + mean_param + n_components - 1 )
}
getBIC = function(Integer n, Double norm, Integer df)
return(Double bic)
{
bic = -2 * norm * n + df * log(n)
}
# check if covariance matrix is symmetric and positive definite
checkSPD = function(Matrix[Double] A)
return(Boolean isSPD)
{
# abs(a - t(a)) <= (absoluteTolerance + relativeTolerance * abs(b))
sym = abs(A - t(A)) <= (1e-4 + abs(t(A)))
if(sum(sym == 0) == 0) {
[eval, evec] = eigen(A);
#check positive definite
isSPD = (sum(eval < 0) < 1e-4)
}
else isSPD = FALSE
# isSPD = TRUE
}
choleskymatrix = function(Matrix[Double] m)
return(Matrix[Double] L)
{
rows = nrow(m)
cols = ncol(m)
L = diag(matrix(0, rows, 1))
for(i in 1:rows) {
for(k in 1:i) {
sum = sum(L[1:k, i] * L[1:k, k])
if(i == k)
L[k, i] = sqrt(m[i, i] - sum)
else
L[k, i] = (m[k, i] - sum) / L[k, k]
}
}
L = t(L)
}