scripts/builtin/gmm.dml - systemds - Git at Google

 #-------------------------------------------------------------
 #
 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
 #
 #-------------------------------------------------------------

 # ------------------------------------------
 # Gaussian Mixture Model
 # ------------------------------------------

 # INPUT PARAMETERS:
 # ---------------------------------------------------------------------------------------------
 # NAME            TYPE     DEFAULT     MEANING
 # ---------------------------------------------------------------------------------------------
 # X               Double   ---         Matrix X
 # n_components    Integer  3           Number of n_components in the Gaussian mixture model
 # model           String   "VVV"       "VVV": unequal variance (full),each component has its own general covariance matrix
 #                                      "EEE": equal variance (tied), all components share the same general covariance matrix
 #                                      "VVI": spherical, unequal volume (diag), each component has its own diagonal
 #                                             covariance matrix
 #                                      "VII": spherical, equal volume (spherical), each component has its own single variance
 # init_param      String   "kmeans"    initialize weights with "kmeans" or "random"
 # iterations      Integer  100         Number of iterations
 # reg_covar       Double   1e-6        regularization parameter for covariance matrix
 # tol             Double   0.000001    tolerance value for convergence
 # ---------------------------------------------------------------------------------------------


 #Output(s)
 # ---------------------------------------------------------------------------------------------
 # NAME            TYPE     DEFAULT     MEANING
 # ---------------------------------------------------------------------------------------------
 # weight          Double   ---         A matrix whose [i,k]th entry is the probability that observation i in the test data
 #                                      belongs to the kth class
 # labels          Double   ---         Prediction matrix
 # df              Integer  ---         Number of estimated parameters
 # bic             Double   ---         Bayesian information criterion for best iteration


 m_gmm = function(Matrix[Double] X, Integer n_components = 3, String model = "VVV", String init_params = "kmeans",
   Integer iter = 100, Double reg_covar = 1e-6, Double tol = 0.000001, Integer seed = -1, Boolean verbose = FALSE )
 return (Matrix[Double] labels, Matrix[Double] predict_prob, Integer df, Double bic,
   Matrix[Double] mu, Matrix[Double] prec_chol, Matrix[Double] weight)
 {
   # sanity checks
   if(model != "VVV" & model != "EEE" &    model != "VVI" & model != "VII")
     stop("model not supported, should be in VVV, EEE, VVI, VII");

   [labels, predict_prob, norm, mu, prec_chol, weight] = fit(X, n_components,
     model, init_params, iter, reg_covar, tol, seed,  verbose)
   df = estimate_free_param(n_components, ncol(X), model)
   bic = getBIC(nrow(X), norm, df)
 }

 fit = function(Matrix[Double] X, Integer n_components, String model, String init_params,
   Integer iter, Double reg_covar, Double tol, Integer seed, Boolean verbose)
 return (Matrix[Double] label, Matrix[Double] predict_prob, Double log_prob_norm,
     Matrix[Double] mean, Matrix[Double] precision_chol, Matrix[Double] weight)
 {
   et = FALSE
   lower_bound = 0
   converged = FALSE
   [weight, mean, sigma, precision_chol] = initialize_param(X, n_components,init_params, model, reg_covar, tol, seed)
   i = 1
   while(i <= iter & !converged & !et) {
     prev_lower_bound = lower_bound
     [log_prob_norm, log_resp, weighted_log_prob] = e_step(X, weight, mean, precision_chol, model)
     [weight, mean, sigma, precision_chol, et] = m_step(X, log_resp, n_components, model, reg_covar)
     lower_bound = log_prob_norm
     change = lower_bound - prev_lower_bound
     converged = (abs(change) < tol)
     if(verbose) {
       print("executing " +i+" iteration")
       print("converged " +converged)
       print("diff: "+(abs(change))+" tol: "+tol)
     }
     i = i+1
   }
   if(et) {
     print("warning: did not converge because some components have ill-defined empirical covariance
     (i.e., singleton matrix or non-symmetric).
     \nTry to decrease the number of components, or increase reg_covar")
     label = rowIndexMax(weighted_log_prob)
     predict_prob = exp(log_resp)
   }
   else {
     [log_prob_norm, log_resp, weighted_log_prob] = e_step(X, weight, mean, precision_chol, model)
     label = rowIndexMax(weighted_log_prob)
     predict_prob = exp(log_resp)
   }

 }

 initialize_param = function(Matrix[Double] X, Integer n_components, String init_params,
   String model, Double reg_covar, Double tol , Integer seed)
 return (Matrix[Double] weight, Matrix[Double] mean, Matrix[Double] sigma, Matrix[Double] precision_chol)
 {
   # create responsibility matrix, resp[n_samples, n_components]
   resp = matrix(0, nrow(X), n_components)
   if(init_params == "kmeans") {
     [C, Y] = kmeans(X=X, k=n_components, runs=10,
       eps=tol, is_verbose=FALSE, avg_sample_size_per_centroid=100)
     resp = ((resp + t(seq(1, n_components))) == Y)
   }
   else if(init_params == "random") {
     resp = Rand(rows = nrow(X), cols=n_components, seed=seed)
     resp = resp/rowSums(resp)
   }
   else stop("invalid parameter value, expected kmeans or random found "+init_params)

   [weight, mean, sigma] = estimate_gaussian_param(X, resp, n_components, model, reg_covar)
   weight = weight/nrow(X)
   [precision_chol, et] = compute_precision_cholesky(sigma, model, n_components)
   if(et)
     stop("Fitting the mixture model failed because some components have ill-defined empirical covariance
     (i.e., singleton matrix or non-symmetric).
     \nTry to decrease the number of components, or increase reg_covar")
 }

 estimate_gaussian_param = function(Matrix[Double] X, Matrix[Double] resp,
   Integer n_components, String model, Double reg_covar)
 return (Matrix[Double] weight, Matrix[Double] mean, Matrix[Double] sigma)
 {
   MACHINE_PRECISION = 2.22e-16
   # estimate Gaussian parameter
   weight = colSums(resp) + MACHINE_PRECISION # adding machine precision
   mean = (t(resp) %*% X) / t(weight) # mean dims:  n_components * n_features

   if(model == "VVV") {
   # output: (sigma a list of length = n_components where each item in list is a covariance matrix of (
   # n_features * n_features) dimensions) all rbind in a matrix form
     sigma = matrix(0, 0, ncol(X))
     for(k in 1:nrow(mean)) {
       diff = X - mean[k,]
       cov = (t(diff * resp[, k]) %*% diff) / as.scalar(weight[1,k])
       cov = cov + diag(matrix(reg_covar, ncol(cov), 1))
       sigma = rbind(sigma, cov)
     }
   }
   else if(model == "EEE") {
   # output: (sigma a list of length = 1 where  item in list is a covariance matrix of (n_features * n_features) dimensions)
   # all rbind in a matrix form
     avgX2 = t(X) %*% X
     avgMean = (t(mean) * weight) %*% mean
     cov = avgX2 - avgMean
     cov = cov / sum(weight)
     cov = cov + diag(matrix(reg_covar, ncol(cov), 1))
     sigma = cov
   }
   else if(model ==  "VVI") {
   # output: (sigma a list of length = 1 where item in list is a covariance matrix of (n_components * n_features) dimensions)
     avgX2 = (t(resp) %*% (X*X)) / t(weight)
     avgMean = mean ^ 2
     avgMean2 = mean * (t(resp) %*% X) / t(weight)
     cov = avgX2 - 2 * avgMean + avgMean2 + reg_covar
     sigma = cov
   }
   else if (model == "VII") {
   # output: (sigma a list of length = 1 where item in list is a variance value for each component (1* n_components) dimensions)
     avgX2 = (t(resp) %*% (X*X)) / t(weight)
     avgMean = mean ^ 2
     avgMean2 = mean * (t(resp) %*% X) / t(weight)
     cov = avgX2 - 2 * avgMean + avgMean2 + reg_covar
     sigma = rowMeans(cov)
   }
 }

 compute_precision_cholesky = function(Matrix[Double] sigma, String model, Integer n_components)
 return (Matrix[Double] precision_chol, Boolean earlyTermination )
 {
   earlyTermination = FALSE
   if(model == "VVV") {
     index = 1; k = 1
     precision_chol = matrix(0, 0, ncol(sigma))
     while(k <= n_components) {
       cov = sigma[index:(ncol(sigma)*k), ]
       isSPD = checkSPD(cov)
       if(isSPD) {
         cov_chol = choleskymatrix(cov)
         pre_chol = t(inv(cov_chol))
         precision_chol = rbind(precision_chol, pre_chol)
         index = index + ncol(sigma)
         k = k+1
       } else {
         earlyTermination = TRUE;
         k = n_components + 1
       }
     }
   }
   else if(model == "EEE") {
     cov = sigma
     isSPD = checkSPD(cov)
     if(isSPD) {
       cov_chol = cholesky(cov)
       pre_chol = t(inv(cov_chol))
       precision_chol = pre_chol
     } else
       earlyTermination = TRUE
   }
   else {
     cov = sigma
     if(sum(cov <= 0) > 0)
       earlyTermination = TRUE
     else {
       precision_chol = 1.0/sqrt(cov)
     }
   }
 }

 # Expectation step
 e_step = function(Matrix[Double] X, Matrix[Double] w, Matrix[Double] mu,
   Matrix[Double] precisions_cholesky, String model)
   return(Double norm, Matrix[Double] log_resp, Matrix[Double] weighted_log_prob)
 {
   weighted_log_prob =  estimate_log_gaussian_prob(X, mu, precisions_cholesky, model) + log(w)
   log_prob_norm = logSumExp(weighted_log_prob, "rows")
   log_resp = weighted_log_prob - log_prob_norm
   norm = mean(log_prob_norm)
 }

 # maximization Step
 m_step = function(Matrix[Double] X, Matrix[Double] log_resp, Integer n_components, String model, Double reg_covar)
   return (Matrix[Double] weight, Matrix[Double] mean, Matrix[Double] sigma, Matrix[Double] precision_chol, Boolean et)
 {
   [weight, mean, sigma] = estimate_gaussian_param(X, exp(log_resp), n_components, model, reg_covar)
   weight = weight/nrow(X)
   [precision_chol, et] = compute_precision_cholesky(sigma, model, n_components)
 }

 estimate_log_gaussian_prob = function(Matrix[Double] X, Matrix[Double] mu, Matrix[Double] prec_chol, String model)
   return(Matrix[Double] es_log_prob ) # nrow(X) * n_components
 {
   n_components = nrow(mu)

   log_det = compute_log_det_cholesky(prec_chol, model, ncol(X))
   if(model == "VVV") {
     log_prob = matrix(0, nrow(X), n_components)
     i = 1
     for(k in 1:n_components) {
       prec = prec_chol[i:(k*ncol(X)),]
       y = X %*% prec - mu[k,] %*% prec  # changing here t intro:  y = X %*% prec - mu[k,] %*% prec
       log_prob[, k] = rowSums(y*y)
       i = i + ncol(X)
     }
   }
   else if(model == "EEE") {
     log_prob = matrix(0, nrow(X), n_components)
     prec = prec_chol
     for(k in 1:n_components) {
       y = X %*% prec - mu[k,] %*% prec
       log_prob[, k] = rowSums(y*y) # TODO replace y*y with squared built-in
     }
   }
   else if(model ==  "VVI") {
     prec = prec_chol
     precisions = prec^2
     bc_matrix = matrix(1,nrow(X), nrow(mu))
     log_prob = (bc_matrix*t(rowSums(mu^2 * precisions)) -
                     2. * (X %*% t(mu * precisions)) +
                     X^2 %*% t(precisions))
   }
   else if (model == "VII") {
     prec = prec_chol
     precisions = prec^ 2
     bc_matrix = matrix(1,nrow(X), nrow(mu))
     log_prob = (bc_matrix * t(rowSums(mu^2) * precisions) -
                     2 * X %*% t(mu * precisions) +
                     rowSums(X*X) %*% t(precisions) ) # TODO replace rowSums(X*X) with squared rowNorm() built-in
   }
   if(ncol(log_det) == 1)
     log_det = matrix(1, 1, ncol(log_prob)) * log_det
   es_log_prob = -.5 * (ncol(X) * log(2 * pi) + log_prob) + log_det
 }

 compute_log_det_cholesky = function(Matrix[Double] mat_chol, String model, Integer d)
   return(Matrix[Double] log_det_cholesky)
 {
   comp = nrow(mat_chol)/ncol(mat_chol)

   if(model == "VVV") {
     log_det_chol = matrix(0, 1, comp)
     i = 1
     for(k in 1:comp) {
       mat = mat_chol[i:(k*ncol(mat_chol))]
       log_det = sum(log(diag(t(mat))))   # have to take the log of diag elements only
       log_det_chol[1,k] = log_det
       i = i + ncol(mat_chol)
     }
   }
   else if(model == "EEE")
     log_det_chol = as.matrix(sum(log(diag(mat_chol))))
   else if(model ==  "VVI")
     log_det_chol = t(rowSums(log(mat_chol)))
   else if (model == "VII")
     log_det_chol = t(d * log(mat_chol))

   log_det_cholesky = log_det_chol
 }

 # compute the number of estimated parameters
 estimate_free_param = function(Integer n_components, Integer n_features, String model)
   return (Integer n_parameters)
 {
   if(model == "VVV")
     cov_param = n_components * n_features * (n_features + 1) / 2
   else if(model == "EEE")
     cov_param = n_features * (n_features + 1) / 2
   else if (model == "VVI")
     cov_param = n_components * n_features
   else if (model == "VII")
     cov_param = n_components
   else
     stop("invalid model expecting any of [VVV,EEE,VVI,VII], found "+model)
   mean_param = n_features * n_components

   n_parameters = as.integer( cov_param + mean_param + n_components - 1 )
 }

 getBIC = function(Integer n, Double norm, Integer df)
   return(Double bic)
 {
   bic = -2 * norm * n + df * log(n)
 }

 # check if covariance matrix is symmetric and positive definite
 checkSPD = function(Matrix[Double] A)
   return(Boolean isSPD)
 {
   # abs(a - t(a)) <= (absoluteTolerance + relativeTolerance * abs(b))
   sym = abs(A - t(A)) <= (1e-4 + abs(t(A)))
   if(sum(sym == 0) == 0) {
     [eval, evec] = eigen(A);
     #check positive definite
     isSPD = (sum(eval < 0) < 1e-4)
   }
   else isSPD = FALSE
   # isSPD = TRUE
 }

 choleskymatrix = function(Matrix[Double] m)
   return(Matrix[Double] L)
 {
   rows = nrow(m)
   cols = ncol(m)
   L = diag(matrix(0, rows, 1))
   for(i in 1:rows) {
     for(k in 1:i) {
       sum = sum(L[1:k, i] * L[1:k, k])
       if(i == k)
         L[k, i] = sqrt(m[i, i] - sum)
       else
         L[k, i] = (m[k, i] - sum) / L[k, k]
     }
   }
   L = t(L)
 }
	#-------------------------------------------------------------
	#
	# Licensed to the Apache Software Foundation (ASF) under one
	# or more contributor license agreements. See the NOTICE file
	# distributed with this work for additional information
	# regarding copyright ownership. The ASF licenses this file
	# to you under the Apache License, Version 2.0 (the
	# "License"); you may not use this file except in compliance
	# with the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing,
	# software distributed under the License is distributed on an
	# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	# KIND, either express or implied. See the License for the
	# specific language governing permissions and limitations
	# under the License.
	#
	#-------------------------------------------------------------

	# ------------------------------------------
	# Gaussian Mixture Model
	# ------------------------------------------

	# INPUT PARAMETERS:
	# ---------------------------------------------------------------------------------------------
	# NAME TYPE DEFAULT MEANING
	# ---------------------------------------------------------------------------------------------
	# X Double --- Matrix X
	# n_components Integer 3 Number of n_components in the Gaussian mixture model
	# model String "VVV" "VVV": unequal variance (full),each component has its own general covariance matrix
	# "EEE": equal variance (tied), all components share the same general covariance matrix
	# "VVI": spherical, unequal volume (diag), each component has its own diagonal
	# covariance matrix
	# "VII": spherical, equal volume (spherical), each component has its own single variance
	# init_param String "kmeans" initialize weights with "kmeans" or "random"
	# iterations Integer 100 Number of iterations
	# reg_covar Double 1e-6 regularization parameter for covariance matrix
	# tol Double 0.000001 tolerance value for convergence
	# ---------------------------------------------------------------------------------------------


	#Output(s)
	# ---------------------------------------------------------------------------------------------
	# NAME TYPE DEFAULT MEANING
	# ---------------------------------------------------------------------------------------------
	# weight Double --- A matrix whose [i,k]th entry is the probability that observation i in the test data
	# belongs to the kth class
	# labels Double --- Prediction matrix
	# df Integer --- Number of estimated parameters
	# bic Double --- Bayesian information criterion for best iteration


	m_gmm = function(Matrix[Double] X, Integer n_components = 3, String model = "VVV", String init_params = "kmeans",
	Integer iter = 100, Double reg_covar = 1e-6, Double tol = 0.000001, Integer seed = -1, Boolean verbose = FALSE )
	return (Matrix[Double] labels, Matrix[Double] predict_prob, Integer df, Double bic,
	Matrix[Double] mu, Matrix[Double] prec_chol, Matrix[Double] weight)
	{
	# sanity checks
	if(model != "VVV" & model != "EEE" & model != "VVI" & model != "VII")
	stop("model not supported, should be in VVV, EEE, VVI, VII");

	[labels, predict_prob, norm, mu, prec_chol, weight] = fit(X, n_components,
	model, init_params, iter, reg_covar, tol, seed, verbose)
	df = estimate_free_param(n_components, ncol(X), model)
	bic = getBIC(nrow(X), norm, df)
	}

	fit = function(Matrix[Double] X, Integer n_components, String model, String init_params,
	Integer iter, Double reg_covar, Double tol, Integer seed, Boolean verbose)
	return (Matrix[Double] label, Matrix[Double] predict_prob, Double log_prob_norm,
	Matrix[Double] mean, Matrix[Double] precision_chol, Matrix[Double] weight)
	{
	et = FALSE
	lower_bound = 0
	converged = FALSE
	[weight, mean, sigma, precision_chol] = initialize_param(X, n_components,init_params, model, reg_covar, tol, seed)
	i = 1
	while(i <= iter & !converged & !et) {
	prev_lower_bound = lower_bound
	[log_prob_norm, log_resp, weighted_log_prob] = e_step(X, weight, mean, precision_chol, model)
	[weight, mean, sigma, precision_chol, et] = m_step(X, log_resp, n_components, model, reg_covar)
	lower_bound = log_prob_norm
	change = lower_bound - prev_lower_bound
	converged = (abs(change) < tol)
	if(verbose) {
	print("executing " +i+" iteration")
	print("converged " +converged)
	print("diff: "+(abs(change))+" tol: "+tol)
	}
	i = i+1
	}
	if(et) {
	print("warning: did not converge because some components have ill-defined empirical covariance
	(i.e., singleton matrix or non-symmetric).
	\nTry to decrease the number of components, or increase reg_covar")
	label = rowIndexMax(weighted_log_prob)
	predict_prob = exp(log_resp)
	}
	else {
	[log_prob_norm, log_resp, weighted_log_prob] = e_step(X, weight, mean, precision_chol, model)
	label = rowIndexMax(weighted_log_prob)
	predict_prob = exp(log_resp)
	}

	}

	initialize_param = function(Matrix[Double] X, Integer n_components, String init_params,
	String model, Double reg_covar, Double tol , Integer seed)
	return (Matrix[Double] weight, Matrix[Double] mean, Matrix[Double] sigma, Matrix[Double] precision_chol)
	{
	# create responsibility matrix, resp[n_samples, n_components]
	resp = matrix(0, nrow(X), n_components)
	if(init_params == "kmeans") {
	[C, Y] = kmeans(X=X, k=n_components, runs=10,
	eps=tol, is_verbose=FALSE, avg_sample_size_per_centroid=100)
	resp = ((resp + t(seq(1, n_components))) == Y)
	}
	else if(init_params == "random") {
	resp = Rand(rows = nrow(X), cols=n_components, seed=seed)
	resp = resp/rowSums(resp)
	}
	else stop("invalid parameter value, expected kmeans or random found "+init_params)

	[weight, mean, sigma] = estimate_gaussian_param(X, resp, n_components, model, reg_covar)
	weight = weight/nrow(X)
	[precision_chol, et] = compute_precision_cholesky(sigma, model, n_components)
	if(et)
	stop("Fitting the mixture model failed because some components have ill-defined empirical covariance
	(i.e., singleton matrix or non-symmetric).
	\nTry to decrease the number of components, or increase reg_covar")
	}

	estimate_gaussian_param = function(Matrix[Double] X, Matrix[Double] resp,
	Integer n_components, String model, Double reg_covar)
	return (Matrix[Double] weight, Matrix[Double] mean, Matrix[Double] sigma)
	{
	MACHINE_PRECISION = 2.22e-16
	# estimate Gaussian parameter
	weight = colSums(resp) + MACHINE_PRECISION # adding machine precision
	mean = (t(resp) %% X) / t(weight) # mean dims: n_components n_features

	if(model == "VVV") {
	# output: (sigma a list of length = n_components where each item in list is a covariance matrix of (
	# n_features * n_features) dimensions) all rbind in a matrix form
	sigma = matrix(0, 0, ncol(X))
	for(k in 1:nrow(mean)) {
	diff = X - mean[k,]
	cov = (t(diff * resp[, k]) %*% diff) / as.scalar(weight[1,k])
	cov = cov + diag(matrix(reg_covar, ncol(cov), 1))
	sigma = rbind(sigma, cov)
	}
	}
	else if(model == "EEE") {
	# output: (sigma a list of length = 1 where item in list is a covariance matrix of (n_features * n_features) dimensions)
	# all rbind in a matrix form
	avgX2 = t(X) %*% X
	avgMean = (t(mean) * weight) %*% mean
	cov = avgX2 - avgMean
	cov = cov / sum(weight)
	cov = cov + diag(matrix(reg_covar, ncol(cov), 1))
	sigma = cov
	}
	else if(model == "VVI") {
	# output: (sigma a list of length = 1 where item in list is a covariance matrix of (n_components * n_features) dimensions)
	avgX2 = (t(resp) %% (XX)) / t(weight)
	avgMean = mean ^ 2
	avgMean2 = mean * (t(resp) %*% X) / t(weight)
	cov = avgX2 - 2 * avgMean + avgMean2 + reg_covar
	sigma = cov
	}
	else if (model == "VII") {
	# output: (sigma a list of length = 1 where item in list is a variance value for each component (1* n_components) dimensions)
	avgX2 = (t(resp) %% (XX)) / t(weight)
	avgMean = mean ^ 2
	avgMean2 = mean * (t(resp) %*% X) / t(weight)
	cov = avgX2 - 2 * avgMean + avgMean2 + reg_covar
	sigma = rowMeans(cov)
	}
	}

	compute_precision_cholesky = function(Matrix[Double] sigma, String model, Integer n_components)
	return (Matrix[Double] precision_chol, Boolean earlyTermination )
	{
	earlyTermination = FALSE
	if(model == "VVV") {
	index = 1; k = 1
	precision_chol = matrix(0, 0, ncol(sigma))
	while(k <= n_components) {
	cov = sigma[index:(ncol(sigma)*k), ]
	isSPD = checkSPD(cov)
	if(isSPD) {
	cov_chol = choleskymatrix(cov)
	pre_chol = t(inv(cov_chol))
	precision_chol = rbind(precision_chol, pre_chol)
	index = index + ncol(sigma)
	k = k+1
	} else {
	earlyTermination = TRUE;
	k = n_components + 1
	}
	}
	}
	else if(model == "EEE") {
	cov = sigma
	isSPD = checkSPD(cov)
	if(isSPD) {
	cov_chol = cholesky(cov)
	pre_chol = t(inv(cov_chol))
	precision_chol = pre_chol
	} else
	earlyTermination = TRUE
	}
	else {
	cov = sigma
	if(sum(cov <= 0) > 0)
	earlyTermination = TRUE
	else {
	precision_chol = 1.0/sqrt(cov)
	}
	}
	}

	# Expectation step
	e_step = function(Matrix[Double] X, Matrix[Double] w, Matrix[Double] mu,
	Matrix[Double] precisions_cholesky, String model)
	return(Double norm, Matrix[Double] log_resp, Matrix[Double] weighted_log_prob)
	{
	weighted_log_prob = estimate_log_gaussian_prob(X, mu, precisions_cholesky, model) + log(w)
	log_prob_norm = logSumExp(weighted_log_prob, "rows")
	log_resp = weighted_log_prob - log_prob_norm
	norm = mean(log_prob_norm)
	}

	# maximization Step
	m_step = function(Matrix[Double] X, Matrix[Double] log_resp, Integer n_components, String model, Double reg_covar)
	return (Matrix[Double] weight, Matrix[Double] mean, Matrix[Double] sigma, Matrix[Double] precision_chol, Boolean et)
	{
	[weight, mean, sigma] = estimate_gaussian_param(X, exp(log_resp), n_components, model, reg_covar)
	weight = weight/nrow(X)
	[precision_chol, et] = compute_precision_cholesky(sigma, model, n_components)
	}

	estimate_log_gaussian_prob = function(Matrix[Double] X, Matrix[Double] mu, Matrix[Double] prec_chol, String model)
	return(Matrix[Double] es_log_prob ) # nrow(X) * n_components
	{
	n_components = nrow(mu)

	log_det = compute_log_det_cholesky(prec_chol, model, ncol(X))
	if(model == "VVV") {
	log_prob = matrix(0, nrow(X), n_components)
	i = 1
	for(k in 1:n_components) {
	prec = prec_chol[i:(k*ncol(X)),]
	y = X %% prec - mu[k,] %% prec # changing here t intro: y = X %% prec - mu[k,] %% prec
	log_prob[, k] = rowSums(y*y)
	i = i + ncol(X)
	}
	}
	else if(model == "EEE") {
	log_prob = matrix(0, nrow(X), n_components)
	prec = prec_chol
	for(k in 1:n_components) {
	y = X %% prec - mu[k,] %% prec
	log_prob[, k] = rowSums(yy) # TODO replace yy with squared built-in
	}
	}
	else if(model == "VVI") {
	prec = prec_chol
	precisions = prec^2
	bc_matrix = matrix(1,nrow(X), nrow(mu))
	log_prob = (bc_matrixt(rowSums(mu^2 precisions)) -
	2. * (X %% t(mu precisions)) +
	X^2 %*% t(precisions))
	}
	else if (model == "VII") {
	prec = prec_chol
	precisions = prec^ 2
	bc_matrix = matrix(1,nrow(X), nrow(mu))
	log_prob = (bc_matrix * t(rowSums(mu^2) * precisions) -
	2 * X %% t(mu precisions) +
	rowSums(XX) %% t(precisions) ) # TODO replace rowSums(X*X) with squared rowNorm() built-in
	}
	if(ncol(log_det) == 1)
	log_det = matrix(1, 1, ncol(log_prob)) * log_det
	es_log_prob = -.5 * (ncol(X) * log(2 * pi) + log_prob) + log_det
	}

	compute_log_det_cholesky = function(Matrix[Double] mat_chol, String model, Integer d)
	return(Matrix[Double] log_det_cholesky)
	{
	comp = nrow(mat_chol)/ncol(mat_chol)

	if(model == "VVV") {
	log_det_chol = matrix(0, 1, comp)
	i = 1
	for(k in 1:comp) {
	mat = mat_chol[i:(k*ncol(mat_chol))]
	log_det = sum(log(diag(t(mat)))) # have to take the log of diag elements only
	log_det_chol[1,k] = log_det
	i = i + ncol(mat_chol)
	}
	}
	else if(model == "EEE")
	log_det_chol = as.matrix(sum(log(diag(mat_chol))))
	else if(model == "VVI")
	log_det_chol = t(rowSums(log(mat_chol)))
	else if (model == "VII")
	log_det_chol = t(d * log(mat_chol))

	log_det_cholesky = log_det_chol
	}

	# compute the number of estimated parameters
	estimate_free_param = function(Integer n_components, Integer n_features, String model)
	return (Integer n_parameters)
	{
	if(model == "VVV")
	cov_param = n_components * n_features * (n_features + 1) / 2
	else if(model == "EEE")
	cov_param = n_features * (n_features + 1) / 2
	else if (model == "VVI")
	cov_param = n_components * n_features
	else if (model == "VII")
	cov_param = n_components
	else
	stop("invalid model expecting any of [VVV,EEE,VVI,VII], found "+model)
	mean_param = n_features * n_components

	n_parameters = as.integer( cov_param + mean_param + n_components - 1 )
	}

	getBIC = function(Integer n, Double norm, Integer df)
	return(Double bic)
	{
	bic = -2 * norm * n + df * log(n)
	}

	# check if covariance matrix is symmetric and positive definite
	checkSPD = function(Matrix[Double] A)
	return(Boolean isSPD)
	{
	# abs(a - t(a)) <= (absoluteTolerance + relativeTolerance * abs(b))
	sym = abs(A - t(A)) <= (1e-4 + abs(t(A)))
	if(sum(sym == 0) == 0) {
	[eval, evec] = eigen(A);
	#check positive definite
	isSPD = (sum(eval < 0) < 1e-4)
	}
	else isSPD = FALSE
	# isSPD = TRUE
	}

	choleskymatrix = function(Matrix[Double] m)
	return(Matrix[Double] L)
	{
	rows = nrow(m)
	cols = ncol(m)
	L = diag(matrix(0, rows, 1))
	for(i in 1:rows) {
	for(k in 1:i) {
	sum = sum(L[1:k, i] * L[1:k, k])
	if(i == k)
	L[k, i] = sqrt(m[i, i] - sum)
	else
	L[k, i] = (m[k, i] - sum) / L[k, k]
	}
	}
	L = t(L)
	}