[SYSTEMDS-2710] Fix K-Means and GMM builtin functions, robustness IPA This patch fixes various issues related to the recent addition of seeds to the Kmeans builtin function. First, the GMM built-in function was failing in IPA because the GMM call to Kmeans used by-position arguments and thus missed the new seed argument. We now use named parameters to guard against future additions. Second, the error handling in such cases of mismatching numbers of arguments was horrible (failing in inter-procedural analysis index out of bounds exceptions). We now use a more robust handling in IPA such that the user get the intended, error message explaining the problem. Third, the kmeans addition of seeds used a uniform random matrix and incrementally added uniform random matrices per centroid. Adding multiple uniformly distributed random variables, gives a normally distributed random variable. For K-Means initialization this is not desirable and ultimately led to the flaky python test failures.

commit: 3f8ff967da5e8b01d9e04604290646d1b748d20d [log] [tgz]
author: Matthias Boehm <mboehm7@gmail.com> Sat Oct 31 23:05:47 2020 +0100
committer: Matthias Boehm <mboehm7@gmail.com> Sat Oct 31 23:06:44 2020 +0100
tree: a5e963c9cc61668a6d7051a6addd4d97c2c9fc81
parent: e96e1cbafc53e90f64c08899cccbc1f02a9e46b3 [diff]
diff --git a/scripts/builtin/gmm.dml b/scripts/builtin/gmm.dml
index 2eafe13..7c327ca 100644
--- a/scripts/builtin/gmm.dml
+++ b/scripts/builtin/gmm.dml

@@ -54,8 +54,7 @@
 
 m_gmm = function(Matrix[Double] X, Integer n_components = 1, String model = "VVV", String init_params = "kmeans", Integer iter = 100, Double reg_covar = 1e-6, Double tol = 0.000001, Boolean verbose = FALSE )
 return (Matrix[Double] weights, Matrix[Double] labels, Integer df, Double bic)
-{ 
-
+{
   # sanity checks
   if(model != "VVV" & model != "EEE" & model != "VVI" & model != "VII")
     stop("model not supported, should be in VVV, EEE, VVI, VII");
@@ -72,13 +71,14 @@
   resp = matrix(0, nrow(X), n_components)
   if(init_params == "kmeans")
   {
-    [C, Y] = kmeans(X, n_components, 10, 10, tol, FALSE, 25)
+    [C, Y] = kmeans(X=X, k=n_components, runs=10, max_iter=10, 
+                    eps=tol, is_verbose=FALSE, avg_sample_size_per_centroid=25)
     resp = resp + t(seq(1,n_components))
     resp = resp == Y
   }
   else if(init_params == "random")
   {
-    resp = Rand(rows = nrow(X), cols=n_components)   
+    resp = Rand(rows = nrow(X), cols=n_components)
     resp = resp/rowSums(resp)
   }
   else stop("invalid parameter value, expected kmeans or random found "+init_params) 
@@ -109,7 +109,7 @@
   mu = (t(resp) %*% X) / t(nk)
   sigma = list()
   if(model == "VVV")
-    sigma = covariances_VVV(X, resp, mu, nk, reg_covar) 
+    sigma = covariances_VVV(X, resp, mu, nk, reg_covar)
   else if(model == "EEE")
     sigma = covariances_EEE(X, resp, mu, nk, reg_covar)
   else if(model ==  "VVI")
@@ -118,7 +118,7 @@
     sigma = covariances_VII(X, resp, mu, nk, reg_covar)
     
   weight = nk
-  mean = mu # n_components * n_features     
+  mean = mu # n_components * n_features
 }
 
 # Matrix/Vector Parameters/List
@@ -128,13 +128,12 @@
 return(List[Unknown] sigma)
 {
   sigma = list()
-  for(k in 1:nrow(mu))
-  {
+  for(k in 1:nrow(mu)) {
     diff = X - mu[k,]
     cov = (t(diff * resp[, k]) %*% diff) / as.scalar(nk[1,k])
     cov = cov + diag(matrix(reg_covar, ncol(cov), 1))
     sigma = append(sigma, cov)
-  }       
+  }
 }
 
 # Matrix/Vector Parameters/List
@@ -192,12 +191,12 @@
       isSPD = checkSPD(cov)
       if(isSPD) {
         cov_chol = cholesky(cov)
-        pre_chol = t(inv(cov_chol))                      
+        pre_chol = t(inv(cov_chol))
         precision_chol = append(precision_chol, pre_chol)
       } else 
         stop("Fitting the mixture model failed because some components have ill-defined empirical covariance (i.e., singleton matrix or non-symmetric )."+ 
         "\nTry to decrease the number of components, or increase reg_covar")
-    }      
+    }
   }
   else if(model == "EEE") {
     cov = as.matrix(sigma[1])
@@ -217,7 +216,7 @@
       "\nTry to decrease the number of components, or increase reg_covar")
     else {
       precision_chol = append(precision_chol, 1.0/sqrt(cov))
-    }   
+    }
   }
 }
 
@@ -244,7 +243,7 @@
 {
   weight_log_pro = estimate_log_prob(X, mu, precisions_cholesky, model) + estimate_log_weights(w)
 }
-  
+
 estimate_log_weights = function(Matrix[Double] w)
 return (Matrix[Double] log_weight)
 {
@@ -263,28 +262,23 @@
 {
   comp = length(mat_chol)
 
-  if(model == "VVV")
-  { 
-    log_det_chol = matrix(0, 1, comp) 
-    for(k in 1:comp)
-    {
+  if(model == "VVV") {
+    log_det_chol = matrix(0, 1, comp)
+    for(k in 1:comp) {
       mat = as.matrix(mat_chol[k])
       log_det = sum(log(diag(t(mat))))   # have to take the log of diag elements only
       log_det_chol[1,k] = log_det
-    }      
+    }
   }
-  else if(model == "EEE")
-  { 
+  else if(model == "EEE") {
     mat = as.matrix(mat_chol[1])
     log_det_chol = as.matrix(sum(log(diag(mat))))
   }
-  else if(model ==  "VVI")
-  {
+  else if(model ==  "VVI") {
     mat = as.matrix(mat_chol[1])
     log_det_chol = t(rowSums(log(mat)))
   }
-  else if (model == "VII")
-  {
+  else if (model == "VII") {
     mat = as.matrix(mat_chol[1])
     log_det_chol = t(d * log(mat))
   }
@@ -299,27 +293,23 @@
   n_components = nrow(mu)
 
   log_det = compute_log_det_cholesky(prec_chol, model, d)
-  if(model == "VVV")
-  { 
-    log_prob = matrix(0, n, n_components) 
-    for(k in 1:n_components)
-    {
+  if(model == "VVV") {
+    log_prob = matrix(0, n, n_components)
+    for(k in 1:n_components) {
       prec = as.matrix(prec_chol[k]) 
       y = X %*% prec - mu[k,] %*% prec  # changing here t intro:  y = X %*% prec - mu[k,] %*% prec 
       log_prob[, k] = rowSums(y*y)
-    }      
+    }
   }
-  else if(model == "EEE")
-  { 
-    log_prob = matrix(0, n, n_components) 
+  else if(model == "EEE") {
+    log_prob = matrix(0, n, n_components)
     prec = as.matrix(prec_chol[1])
     for(k in 1:n_components) {
       y = X %*% prec - mu[k,] %*% prec
       log_prob[, k] = rowSums(y*y) # TODO replace y*y with squared built-in
     }
   }
-  else if(model ==  "VVI")
-  {
+  else if(model ==  "VVI") {
     prec = as.matrix(prec_chol[1])
     precisions = prec^2
     bc_matrix = matrix(1,nrow(X), nrow(mu))
@@ -327,8 +317,7 @@
                     2. * (X %*% t(mu * precisions)) +
                     X^2 %*% t(precisions))
   }
-  else if (model == "VII")
-  {
+  else if (model == "VII") {
     prec = as.matrix(prec_chol[1])
     precisions = prec^ 2
     bc_matrix = matrix(1,nrow(X), nrow(mu))
@@ -367,13 +356,11 @@
   mean_param = n_features * n_components
   
   n_parameters = as.integer( cov_param + mean_param + n_components - 1 )
-
 }
 
 fit = function(Matrix[Double] X, Integer n_components, String model, String init_params, Integer iter , Double reg_covar, Double tol)
 return (Matrix[Double] label, Matrix[Double] predict_prob, Double log_prob_norm)
 {
-
   lower_bound = 0
   converged = FALSE
   n = nrow(X)
@@ -387,7 +374,7 @@
     change = lower_bound - prev_lower_bound
     if(abs(change) < tol)
       converged = TRUE
-    i = i+1   
+    i = i+1
   }
   [log_prob_norm, log_resp] = e_step(X,weight, mean, precision_chol, model)
   label = rowIndexMax(log_resp)
@@ -415,4 +402,3 @@
   }
   else isSPD = FALSE
 }
-

diff --git a/scripts/builtin/kmeans.dml b/scripts/builtin/kmeans.dml
index ee66e3d..3567dc3 100644
--- a/scripts/builtin/kmeans.dml
+++ b/scripts/builtin/kmeans.dml

@@ -85,8 +85,6 @@
 
   min_distances = is_row_in_samples;  # Pick the 1-st centroids uniformly at random
 
-  seed = ifelse(seed == -1, -1, seed + 1)
-  random_row = rand(rows = 1, cols = num_runs, seed = seed);
   for (i in 1 : num_centroids)
   {
     # "Matricize" and prefix-sum to compute the cumulative distribution function:
@@ -95,7 +93,8 @@
 
     # Select the i-th centroid in each sample as a random sample row id with
     # probability ~ min_distances:
-    random_row = random_row + rand(rows = 1, cols = 1, seed = seed + i)
+    lseed = ifelse(seed==-1, -1, seed + i);
+    random_row = rand(rows = 1, cols = num_runs, seed = lseed)
     threshold_matrix = random_row * cdf_min_distances [sample_block_size, ];
     centroid_ids = t(colSums (cdf_min_distances < threshold_matrix)) + 1;
 

diff --git a/src/main/java/org/apache/sysds/hops/ipa/InterProceduralAnalysis.java b/src/main/java/org/apache/sysds/hops/ipa/InterProceduralAnalysis.java
index 579af77..14556cb 100644
--- a/src/main/java/org/apache/sysds/hops/ipa/InterProceduralAnalysis.java
+++ b/src/main/java/org/apache/sysds/hops/ipa/InterProceduralAnalysis.java

@@ -521,8 +521,8 @@
 		ArrayList<Hop> inputOps = fop.getInput();
 		String fkey = fop.getFunctionKey();
 		
-		for( int i=0; i<funArgNames.length; i++ )
-		{
+		//iterate over all parameters (with robustness for missing parameters)
+		for( int i=0; i<Math.min(inputOps.size(), funArgNames.length); i++ ) {
 			//create mapping between input hops and vars
 			DataIdentifier dat = fstmt.getInputParam(funArgNames[i]);
 			Hop input = inputOps.get(i);
commit	3f8ff967da5e8b01d9e04604290646d1b748d20d	[log] [tgz]
author	Matthias Boehm <mboehm7@gmail.com>	Sat Oct 31 23:05:47 2020 +0100
committer	Matthias Boehm <mboehm7@gmail.com>	Sat Oct 31 23:06:44 2020 +0100
tree	a5e963c9cc61668a6d7051a6addd4d97c2c9fc81
parent	e96e1cbafc53e90f64c08899cccbc1f02a9e46b3 [diff]