#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
#
# Implements the k-Means clustering algorithm
#
# INPUT PARAMETERS:
# ----------------------------------------------------------------------------
# NAME TYPE DEFAULT MEANING
# ----------------------------------------------------------------------------
# X String --- Location to read matrix X with the input data records
# k Int --- Number of centroids
# runs Int 10 Number of runs (with different initial centroids)
# maxi Int 1000 Maximum number of iterations per run
# tol Double 0.000001 Tolerance (epsilon) for WCSS change ratio
# samp Int 50 Average number of records per centroid in data samples
# C String "C.mtx" Location to store the output matrix with the centroids
# isY Int 0 0 = do not write Y, 1 = write Y
# Y String "Y.mtx" Location to store the mapping of records to centroids
# fmt String "text" Matrix output format, usually "text" or "csv"
# verb Int 0 0 = do not print per-iteration stats, 1 = print them
# ----------------------------------------------------------------------------
#
# Example:
# hadoop jar SystemML.jar -f Kmeans.dml -nvargs X=X.mtx k=5 C=centroids.mtx
# hadoop jar SystemML.jar -f Kmeans.dml -nvargs X=X.mtx k=5 runs=100 maxi=5000 tol=0.00000001 samp=20 C=centroids.mtx isY=1 Y=clusters.mtx verb=1
fileX = $X;
fileY = ifdef ($Y, "Y.mtx");
fileC = ifdef ($C, "C.mtx");
num_centroids = $k;
num_runs = ifdef ($runs, 10); # $runs=10;
max_iter = ifdef ($maxi, 1000); # $maxi=1000;
eps = ifdef ($tol, 0.000001); # $tol=0.000001;
is_write_Y = ifdef ($isY, 0); # $isY=0;
is_verbose = ifdef ($verb, 0); # $verb=0;
fmtCY = ifdef ($fmt, "text"); # $fmt="text";
avg_sample_size_per_centroid = ifdef ($samp, 50); # $samp=50;
print ("BEGIN K-MEANS SCRIPT");
print ("Reading X...");
# X : matrix of data points as rows
X = read (fileX);
num_records = nrow (X);
num_features = ncol (X);
sumXsq = sum (X ^ 2);
# Remark - A useful rewrite: sum (A %*% B) = sum (t(colSums(A)) * rowSums(B))
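# (this holds because sum (A %*% B) = sum over i,j,k of A[i,j]*B[j,k]
#  = sum over j of colSums(A)[j] * rowSums(B)[j], avoiding the product A %*% B)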
# STEP 1: INITIALIZE CENTROIDS FOR ALL RUNS FROM DATA SAMPLES:
print ("Taking data samples for initialization...");
[sample_maps, samples_vs_runs_map, sample_block_size] =
get_sample_maps (num_records, num_runs, num_centroids * avg_sample_size_per_centroid);
is_row_in_samples = rowSums (sample_maps);
X_samples = sample_maps %*% X;
X_samples_sq_norms = rowSums (X_samples ^ 2);
print ("Initializing the centroids for all runs...");
All_Centroids = matrix (0, rows = (num_runs * num_centroids), cols = num_features);
# We select centroids according to the k-Means++ heuristic applied to a sample of X
# Loop invariant: min_distances ~ sq.distances from X_sample rows to nearest centroids,
# with the out-of-range X_sample positions in min_distances set to 0.0
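# (k-Means++: the 1st centroid of each run is drawn uniformly at random; every subsequent
#  centroid is drawn with probability proportional to the squared distance from the
#  nearest already-chosen centroid of that run)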
min_distances = is_row_in_samples; # Pick the 1st centroid of each run uniformly at random
for (i in 1 : num_centroids)
{
# "Matricize" and prefix-sum to compute the cumulative distribution function:
min_distances_matrix_form =
matrix (min_distances, rows = sample_block_size, cols = num_runs, byrow = FALSE);
cdf_min_distances = cumsum (min_distances_matrix_form);
# Select the i-th centroid in each sample as a random sample row id with
# probability ~ min_distances:
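# (inverse-CDF sampling: draw one uniform value per run, scale it by that run's total,
#  i.e. the last row of the cumulative sums, and count how many CDF entries fall below it;
#  that count + 1 is the sampled row id within the run's sample block)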
random_row = Rand (rows = 1, cols = num_runs, min = 0.0, max = 1.0);
threshold_matrix = random_row * cdf_min_distances [sample_block_size, ];
centroid_ids = t(colSums (cdf_min_distances < threshold_matrix)) + 1;
# Place the selected centroids together, one per run, into a matrix:
centroid_placer = matrix (0, rows = num_runs, cols = (sample_block_size * num_runs));
centroid_placer_raw =
table (seq (1, num_runs, 1), sample_block_size * seq (0, num_runs - 1, 1) + centroid_ids);
centroid_placer [, 1 : ncol (centroid_placer_raw)] = centroid_placer_raw;
centroids = centroid_placer %*% X_samples;
# Place the selected centroids into their appropriate slots in All_Centroids:
centroid_placer = matrix (0, rows = nrow (All_Centroids), cols = num_runs);
centroid_placer_raw =
table (seq (i, num_centroids * (num_runs - 1) + i, num_centroids), seq (1, num_runs, 1));
centroid_placer [1 : nrow (centroid_placer_raw), ] = centroid_placer_raw;
All_Centroids = All_Centroids + centroid_placer %*% centroids;
# Update min_distances to preserve the loop invariant:
distances = X_samples_sq_norms + samples_vs_runs_map %*% rowSums (centroids ^ 2)
- 2 * rowSums (X_samples * (samples_vs_runs_map %*% centroids));
if (i == 1) {
min_distances = is_row_in_samples * distances;
} else {
min_distances = min (min_distances, distances);
} }
# STEP 2: PERFORM K-MEANS ITERATIONS FOR ALL RUNS:
termination_code = matrix (0, rows = num_runs, cols = 1);
final_wcss = matrix (0, rows = num_runs, cols = 1);
num_iterations = matrix (0, rows = num_runs, cols = 1);
print ("Performing k-means iterations for all runs...");
parfor (run_index in 1 : num_runs, check = 0)
{
C = All_Centroids [(num_centroids * (run_index - 1) + 1) : (num_centroids * run_index), ];
C_old = C;
iter_count = 0;
term_code = 0;
wcss = 0;
while (term_code == 0)
{
# Compute Euclidean squared distances from records (X rows) to centroids (C rows)
# without the C-independent term, then take the minimum for each record
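# (the full squared distance is rowSums(X^2) - 2 * X %*% t(C) + t(rowSums(C^2));
#  the rowSums(X^2) term does not change which centroid is closest, so it is dropped here
#  and added back as the precomputed sumXsq when the WCSS is evaluated below)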
D = -2 * (X %*% t(C)) + t(rowSums (C ^ 2));
minD = rowMins (D);
# Compute the current centroid-based within-cluster sum of squares (WCSS)
wcss_old = wcss;
wcss = sumXsq + sum (minD);
if (is_verbose == 1) {
if (iter_count == 0) {
print ("Run " + run_index + ", At Start-Up: Centroid WCSS = " + wcss);
} else {
print ("Run " + run_index + ", Iteration " + iter_count + ": Centroid WCSS = " + wcss
+ "; Centroid change (avg.sq.dist.) = " + (sum ((C - C_old) ^ 2) / num_centroids));
} }
# Check if convergence or maximum iteration has been reached
if (wcss_old - wcss < eps * wcss & iter_count > 0) {
term_code = 1; # Convergence is reached
} else {
if (iter_count >= max_iter) {
term_code = 2; # Maximum iteration is reached
} else {
iter_count = iter_count + 1;
# Find the closest centroid for each record
P = (D <= minD);
# If some records belong to multiple centroids, share them equally
P = P / rowSums (P);
# Compute the column normalization factor for P
P_denom = colSums (P);
if (sum (P_denom <= 0) > 0) {
term_code = 3; # There is a "runaway" centroid with 0.0 denominator
} else {
C_old = C;
# Compute new centroids as weighted averages over the records
C = (t(P) %*% X) / t(P_denom);
} } } }
print ("Run " + run_index + ", Iteration " + iter_count + ": Terminated with code = " + term_code + ", Centroid WCSS = " + wcss);
All_Centroids [(num_centroids * (run_index - 1) + 1) : (num_centroids * run_index), ] = C;
final_wcss [run_index, 1] = wcss;
termination_code [run_index, 1] = term_code;
num_iterations [run_index, 1] = iter_count;
}
# STEP 3: SELECT THE RUN WITH BEST CENTROID-WCSS AND OUTPUT ITS CENTROIDS:
termination_bitmap = matrix (0, rows = num_runs, cols = 3);
termination_bitmap_raw = table (seq (1, num_runs, 1), termination_code);
termination_bitmap [, 1 : ncol(termination_bitmap_raw)] = termination_bitmap_raw;
termination_stats = colSums (termination_bitmap);
print ("Number of successful runs = " + as.integer (as.scalar (termination_stats [1, 1])));
print ("Number of incomplete runs = " + as.integer (as.scalar (termination_stats [1, 2])));
print ("Number of failed runs (with lost centroids) = " + as.integer (as.scalar (termination_stats [1, 3])));
num_successful_runs = as.scalar (termination_stats [1, 1]);
if (num_successful_runs > 0) {
final_wcss_successful = final_wcss * termination_bitmap [, 1];
worst_wcss = max (final_wcss_successful);
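# Mask out unsuccessful runs by inflating their WCSS beyond the worst successful one,
# so that the minimum below is always attained by a successful run: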
best_wcss = min (final_wcss_successful + (10 * worst_wcss + 10) * (1 - termination_bitmap [, 1]));
avg_wcss = sum (final_wcss_successful) / num_successful_runs;
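# Locate the first run that attains the best WCSS: the cumulative sum of the 0/1 indicator
# stays 0 before the first match, so counting the zeros gives (best_index - 1):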
best_index_vector = (final_wcss_successful == best_wcss);
aggr_best_index_vector = cumsum (best_index_vector);
best_index = as.integer (sum (aggr_best_index_vector == 0) + 1);
print ("Successful runs: Best run is " + best_index + " with Centroid WCSS = " + best_wcss
+ "; Avg WCSS = " + avg_wcss + "; Worst WCSS = " + worst_wcss);
C = All_Centroids [(num_centroids * (best_index - 1) + 1) : (num_centroids * best_index), ];
print ("Writing out the best-WCSS centroids...");
write (C, fileC, format=fmtCY);
if (is_write_Y == 1) {
print ("Writing out the best-WCSS cluster labels...");
D = -2 * (X %*% t(C)) + t(rowSums (C ^ 2));
P = (D <= rowMins (D));
aggr_P = t(cumsum (t(P)));
Y = rowSums (aggr_P == 0) + 1;
write (Y, fileY, format=fmtCY);
}
print ("DONE.");
} else {
stop ("No output is produced. Try increasing the number of iterations and/or runs.");
}
get_sample_maps = function (int num_records, int num_samples, int approx_sample_size)
return (Matrix[double] sample_maps, Matrix[double] sample_col_map, int sample_block_size)
{
if (approx_sample_size < num_records) {
# Input value "approx_sample_size" is the average sample size; increase it by about
# 10 standard deviations to get the sample block size (to allocate enough space):
sample_block_size = as.integer (approx_sample_size + round (10 * sqrt (approx_sample_size)));
num_rows = sample_block_size * num_samples;
# Generate all samples in parallel by converting uniform random values into random
# integer skip-ahead intervals and prefix-summing them:
sample_rec_ids = Rand (rows = sample_block_size, cols = num_samples, min = 0.0, max = 1.0);
sample_rec_ids = round (log (sample_rec_ids) / log (1.0 - approx_sample_size / num_records) + 0.5);
# Prob [k-1 < log(uniform)/log(1-p) < k] = p*(1-p)^(k-1) = Prob [k-1 zeros before a one]
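# (i.e. each skip-ahead interval is Geometric(p) with p = approx_sample_size / num_records,
#  so every record is included independently with probability p, without scanning X)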
sample_rec_ids = cumsum (sample_rec_ids); # (skip to next one) --> (skip to i-th one)
# Replace all sample record ids over "num_records" (i.e. out of range) by "num_records + 1":
is_sample_rec_id_within_range = (sample_rec_ids <= num_records);
sample_rec_ids = sample_rec_ids * is_sample_rec_id_within_range
+ (num_records + 1) * (1 - is_sample_rec_id_within_range);
# Rearrange all samples (and their out-of-range indicators) into one column-vector:
sample_rec_ids =
matrix (sample_rec_ids, rows = num_rows, cols = 1, byrow = FALSE);
is_row_in_samples =
matrix (is_sample_rec_id_within_range, rows = num_rows, cols = 1, byrow = FALSE);
# Use contingency table to create the "sample_maps" matrix that is a vertical concatenation
# of 0-1-matrices, one per sample, each with 1s at (i, sample_record[i]) and 0s elsewhere:
sample_maps_raw = table (seq (1, num_rows), sample_rec_ids);
max_rec_id = ncol (sample_maps_raw);
if (max_rec_id >= num_records) {
sample_maps = sample_maps_raw [, 1 : num_records];
} else {
sample_maps = matrix (0, rows = num_rows, cols = num_records);
sample_maps [, 1 : max_rec_id] = sample_maps_raw;
}
# Create a 0-1-matrix that maps each sample column ID into all row positions of the
# corresponding sample; map out-of-sample-range positions to row id = num_rows + 1:
sample_positions = (num_rows + 1) - is_row_in_samples * seq (num_rows, 1, -1);
# Column ID positions = 1, 1, ..., 1, 2, 2, ..., 2, ..., n_c, n_c, ..., n_c:
col_positions = round (0.5 + seq (0, num_rows - 1, 1) / sample_block_size);
sample_col_map = table (sample_positions, col_positions);
# Remove the out-of-sample-range positions by cutting off the last row:
sample_col_map = sample_col_map [1 : (num_rows), ];
} else {
one_per_record = matrix (1, rows = num_records, cols = 1);
sample_block_size = num_records;
sample_maps = matrix (0, rows = (num_records * num_samples), cols = num_records);
sample_col_map = matrix (0, rows = (num_records * num_samples), cols = num_samples);
for (i in 1:num_samples) {
sample_maps [(num_records * (i - 1) + 1) : (num_records * i), ] = diag (one_per_record);
sample_col_map [(num_records * (i - 1) + 1) : (num_records * i), i] = one_per_record;
} } }