| #------------------------------------------------------------- |
| # |
| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| # |
| #------------------------------------------------------------- |
| |
| # |
| # Compares two categorical data vectors (presumed to be clusterings) by counting |
| # matching/nonmatching same-cluster/different-cluster pairs of rows. Optionally, |
| # given X and C, it also predicts the cluster assignments and reports the |
| # within-cluster and between-cluster sums of squares (see the statistics below). |
| # |
| # INPUT PARAMETERS: |
| # --------------------------------------------------------------------------- |
| # NAME TYPE DEFAULT MEANING |
| # --------------------------------------------------------------------------- |
| # spY String " " Location to read a column-vector with the "specified" |
| # assignment of records (rows) to categories (clusters) |
| # prY String " " Location to read (or write, if X and C are present) a |
| # column-vector with the "predicted" assignment of rows |
| # to clusters. NOTE: The same category may be labeled |
| # differently in each of the two vectors, spY and prY. |
| # fmt String "text" Matrix output format for prY, usually "text" or "csv" |
| # X String " " Location to read matrix X with the input data records |
| # C String " " Location to read matrix C with the cluster centroids |
| # NOTE: If X and C are present, prY is an output file. |
| # O String " " Location to write the printed output statistics |
| # --------------------------------------------------------------------------- |
| # |
| # The "O" file provides the output statistics as CSV, one statistic per line, in |
| # the format NAME,CID,VALUE, where the CID field may be empty. Note: |
| # - The 1st group of statistics is given if the X input is available; |
| # - The 2nd group of statistics is given if both the X and C inputs are available; |
| # - The 3rd and 4th groups of statistics are given if the spY input is available; |
| # - Only the 4th group of statistics has a nonempty CID value; |
| # - When present, CID contains either the specified category label or the |
| # predicted cluster label. |
| # |
| # NAME CID MEANING |
| # --------------------------------------------------------------------------- |
| # TSS Total Sum of Squares (from the total mean) |
| # WCSS_M Within-Cluster Sum of Squares (means as centers) |
| # WCSS_M_PC Within-Cluster Sum of Squares (means), in % of TSS |
| # BCSS_M Between-Cluster Sum of Squares (means as centers) |
| # BCSS_M_PC Between-Cluster Sum of Squares (means), in % of TSS |
| # |
| # WCSS_C Within-Cluster Sum of Squares (centroids as centers) |
| # WCSS_C_PC Within-Cluster Sum of Squares (centroids), % of TSS |
| # BCSS_C Between-Cluster Sum of Squares (centroids as centers) |
| # BCSS_C_PC Between-Cluster Sum of Squares (centroids), % of TSS |
| # |
| # TRUE_SAME_CT Same-category pairs predicted as Same-cluster, count |
| # TRUE_SAME_PC Same-category pairs predicted as Same-cluster, % |
| # TRUE_DIFF_CT Diff-category pairs predicted as Diff-cluster, count |
| # TRUE_DIFF_PC Diff-category pairs predicted as Diff-cluster, % |
| # FALSE_SAME_CT Diff-category pairs predicted as Same-cluster, count |
| # FALSE_SAME_PC Diff-category pairs predicted as Same-cluster, % |
| # FALSE_DIFF_CT Same-category pairs predicted as Diff-cluster, count |
| # FALSE_DIFF_PC Same-category pairs predicted as Diff-cluster, % |
| # |
| # SPEC_TO_PRED + For specified category, the best predicted cluster id |
| # SPEC_FULL_CT + For specified category, its full count |
| # SPEC_MATCH_CT + For specified category, best-cluster matching count |
| # SPEC_MATCH_PC + For specified category, % of matching to full count |
| # PRED_TO_SPEC + For predicted cluster, the best specified category id |
| # PRED_FULL_CT + For predicted cluster, its full count |
| # PRED_MATCH_CT + For predicted cluster, best-category matching count |
| # PRED_MATCH_PC + For predicted cluster, % of matching to full count |
| # --------------------------------------------------------------------------- |
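| # |
| # For illustration only, an "O" file produced by this script could contain lines |
| # such as the following (the values below are made up, not from a real run): |
| # TSS,,1024.7 |
| # WCSS_M,,213.9 |
| # WCSS_M_PC,,20.87 |
| # SPEC_TO_PRED,3,5 |
| # SPEC_MATCH_PC,3,97.25 |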
| # |
| # Examples: |
| # 1. To predict Y given X and C: |
| # hadoop jar SystemDS.jar -f Kmeans-predict.dml -nvargs X=INPUT_DIR/X |
| # C=INPUT_DIR/C prY=OUTPUT_DIR/PredY O=OUTPUT_DIR/stats |
| # 2. To compare "actual" labels spY with "predicted" labels given X and C: |
| # hadoop jar SystemDS.jar -f Kmeans-predict.dml -nvargs X=INPUT_DIR/X |
| # C=INPUT_DIR/C spY=INPUT_DIR/Y O=OUTPUT_DIR/stats |
| # 3. To compare "actual" labels spY with given "predicted" labels prY: |
| # hadoop jar SystemDS.jar -f Kmeans-predict.dml -nvargs spY=INPUT_DIR/Y |
| # prY=INPUT_DIR/PredY O=OUTPUT_DIR/stats |
| |
| |
| fmt_prY = ifdef ($fmt, "text"); |
| filePrY = ifdef ($prY, " "); |
| fileSpY = ifdef ($spY, " "); |
| fileX = ifdef ($X, " "); |
| fileC = ifdef ($C, " "); |
| fileO = ifdef ($O, " "); |
| |
| is_str_empty = TRUE; |
| str = " "; |
| |
| print ("BEGIN K-MEANS SCORING SCRIPT"); |
| |
| if (fileX != " ") { |
| print ("Reading X..."); |
| X = read (fileX); |
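| # Grand mean of the data (as a row vector) and the total sum of squared |
| # deviations from it (TSS) |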
| total_mean = colSums (X) / nrow (X); |
| total_ss = sum( (X - total_mean)^2 ); |
| } |
| |
| if ((fileC != " ") & (fileX == " ")) { |
| print ("ERROR: Cannot provide C without providing X."); |
| } else { |
| |
| |
| if (fileC != " ") { |
| print ("Reading C..."); |
| C = read (fileC); |
| num_clusters = nrow (C); |
| print ("Computing the predicted Y..."); |
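| # Nearest-centroid assignment: dist(x,c)^2 = sum(x^2) - 2*sum(x*c) + sum(c^2), and |
| # the sum(x^2) term is the same for every centroid, so it suffices to minimize |
| # -2*(X %*% t(C)) + t(rowSums(C^2)) row-wise |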
| D = -2 * (X %*% t(C)) + t(rowSums (C ^ 2)); |
| prY = rowIndexMin (D); |
| if (filePrY != " ") { |
| print ("Writing the predicted Y..."); |
| write (prY, filePrY, format=fmt_prY); |
| } |
| } else { |
| print ("Reading the predicted Y..."); |
| prY = read (filePrY); |
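| # Without centroids, infer the number of clusters from the largest predicted label |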
| num_clusters = max (prY); |
| } |
| |
| if (fileX != " ") { |
| print ("Computing the WCSS..."); |
| # Compute projection matrix from clusters to records |
| P = table (seq (1, nrow (X), 1), prY, nrow(X), num_clusters); |
| # Compute the means, as opposed to the centroids |
| cluster_sizes = t(colSums (P)); |
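| # The (cluster_sizes == 0) term avoids division by zero for empty clusters |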
| M = (t(P) %*% X) / (cluster_sizes + (cluster_sizes == 0)); |
| # Compute the WCSS for the means |
| wcss_means = sum ((X - P %*% M) ^ 2); |
| wcss_means_pc = 100.0 * wcss_means / total_ss; |
| bcss_means = sum (cluster_sizes * rowSums ((M - total_mean) ^ 2)); |
| bcss_means_pc = 100.0 * bcss_means / total_ss; |
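| # With the cluster means as centers, TSS decomposes exactly into WCSS_M + BCSS_M, |
| # so the two percentages should add up to (approximately) 100 |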
| # Output results |
| print ("Total Sum of Squares (TSS) = " + total_ss); |
| print ("WCSS for cluster means: " + (round (10000.0 * wcss_means_pc) / 10000.0) + "% of TSS = " + wcss_means); |
| print ("BCSS for cluster means: " + (round (10000.0 * bcss_means_pc) / 10000.0) + "% of TSS = " + bcss_means); |
| str = "TSS,," + total_ss; |
| str = append (str, "WCSS_M,," + wcss_means); |
| str = append (str, "WCSS_M_PC,," + wcss_means_pc); |
| str = append (str, "BCSS_M,," + bcss_means); |
| str = append (str, "BCSS_M_PC,," + bcss_means_pc); |
| is_str_empty = FALSE; |
| } |
| |
| if (fileC != " ") { |
| # Compute the WCSS for the centroids |
| wcss_centroids = sum ((X - P %*% C) ^ 2); |
| wcss_centroids_pc = 100.0 * wcss_centroids / total_ss; |
| bcss_centroids = sum (cluster_sizes * rowSums ((C - total_mean) ^ 2)); |
| bcss_centroids_pc = 100.0 * bcss_centroids / total_ss; |
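| # Unlike the means-based statistics, WCSS_C and BCSS_C need not add up to TSS, |
| # because the given centroids are generally not the exact cluster means |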
| # Output results |
| print ("WCSS for centroids: " + (round (10000.0 * wcss_centroids_pc) / 10000.0) + "% of TSS = " + wcss_centroids); |
| print ("BCSS for centroids: " + (round (10000.0 * bcss_centroids_pc) / 10000.0) + "% of TSS = " + bcss_centroids); |
| str = append (str, "WCSS_C,," + wcss_centroids); |
| str = append (str, "WCSS_C_PC,," + wcss_centroids_pc); |
| str = append (str, "BCSS_C,," + bcss_centroids); |
| str = append (str, "BCSS_C_PC,," + bcss_centroids_pc); |
| } |
| |
| |
| |
| if (fileSpY != " ") { |
| |
| print ("Reading the specified Y..."); |
| spY = read (fileSpY); |
| num_records = nrow (spY); |
| |
| if (num_records != nrow (prY) | ncol (spY) != 1 | ncol (prY) != 1) { |
| print ("ERROR: spY and/or prY size mismatch"); |
| print ("nrow (spY) = " + nrow (spY) + "; ncol (spY) = " + ncol (spY) |
| + "; nrow (prY) = " + nrow (prY) + "; ncol (prY) = " + ncol (prY)); |
| } else { |
| |
| print ("Computing the pairs counts..."); |
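| # Shift both label vectors so that the smallest label becomes 1 (table() uses the |
| # label values as indices); the original minima are added back later for reporting. |
| # table(spY, prY) then yields a contingency table whose cell (i,j) counts the rows |
| # with specified label i and predicted label j. |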
| |
| orig_min_spY = min (spY); |
| orig_min_prY = min (prY); |
| spY = spY + (1 - orig_min_spY); |
| prY = prY + (1 - orig_min_prY); |
| |
| spYprY_row_counts = table (spY, prY); |
| spY_row_counts = rowSums (spYprY_row_counts); |
| prY_row_counts = t(colSums (spYprY_row_counts)); |
| |
| # Count all pairs of rows having the same (spY, prY)-values |
| spYprY_pair_counts = spYprY_row_counts * (spYprY_row_counts - 1) / 2; |
| |
| # Count all pairs of rows having the same spY-values |
| spY_pair_counts = spY_row_counts * (spY_row_counts - 1) / 2; |
| # Count all pairs of rows having the same prY-values |
| prY_pair_counts = prY_row_counts * (prY_row_counts - 1) / 2; |
| |
| num_pairs = num_records * (num_records - 1.0) / 2.0; |
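| # Pair-level confusion counts: TP = same category & same cluster, FP = different |
| # category but same cluster, FN = same category but different cluster, TN = the rest |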
| |
| num_TP_pairs = sum (spYprY_pair_counts); |
| num_FP_pairs = sum (prY_pair_counts) - num_TP_pairs; |
| num_FN_pairs = sum (spY_pair_counts) - num_TP_pairs; |
| num_TN_pairs = num_pairs - num_TP_pairs - num_FP_pairs - num_FN_pairs; |
| |
| pct_TP_pairs = num_TP_pairs / num_pairs * 100.0; |
| pct_TN_pairs = num_TN_pairs / num_pairs * 100.0; |
| pct_FP_pairs = num_FP_pairs / num_pairs * 100.0; |
| pct_FN_pairs = num_FN_pairs / num_pairs * 100.0; |
| |
| if (is_str_empty) { |
| str = "TRUE_SAME_CT,," + num_TP_pairs; |
| is_str_empty = FALSE; |
| } else { |
| str = append (str, "TRUE_SAME_CT,," + num_TP_pairs); |
| } |
| str = append (str, "TRUE_SAME_PC,," + pct_TP_pairs); |
| str = append (str, "TRUE_DIFF_CT,," + num_TN_pairs); |
| str = append (str, "TRUE_DIFF_PC,," + pct_TN_pairs); |
| str = append (str, "FALSE_SAME_CT,," + num_FP_pairs); |
| str = append (str, "FALSE_SAME_PC,," + pct_FP_pairs); |
| str = append (str, "FALSE_DIFF_CT,," + num_FN_pairs); |
| str = append (str, "FALSE_DIFF_PC,," + pct_FN_pairs); |
| |
| pct_TP_pairs = round (pct_TP_pairs * 10000.0) / 10000.0; |
| pct_TN_pairs = round (pct_TN_pairs * 10000.0) / 10000.0; |
| pct_FP_pairs = round (pct_FP_pairs * 10000.0) / 10000.0; |
| pct_FN_pairs = round (pct_FN_pairs * 10000.0) / 10000.0; |
| |
| space_TP = ""; if (pct_TP_pairs < 100) {space_TP = " ";} if (pct_TP_pairs < 10) {space_TP = " ";} |
| space_TN = ""; if (pct_TN_pairs < 100) {space_TN = " ";} if (pct_TN_pairs < 10) {space_TN = " ";} |
| space_FP = ""; if (pct_FP_pairs < 100) {space_FP = " ";} if (pct_FP_pairs < 10) {space_FP = " ";} |
| space_FN = ""; if (pct_FN_pairs < 100) {space_FN = " ";} if (pct_FN_pairs < 10) {space_FN = " ";} |
| |
| print ("Same-category pairs predicted as Same-cluster ( True Pos): " + space_TP |
| + pct_TP_pairs + "% of all pairs" + " (" + num_TP_pairs + ")"); |
| print ("Diff-category pairs predicted as Diff-cluster ( True Neg): " + space_TN |
| + pct_TN_pairs + "% of all pairs" + " (" + num_TN_pairs + ")"); |
| print ("Diff-category pairs predicted as Same-cluster (False Pos): " + space_FP |
| + pct_FP_pairs + "% of all pairs" + " (" + num_FP_pairs + ")"); |
| print ("Same-category pairs predicted as Diff-cluster (False Neg): " + space_FN |
| + pct_FN_pairs + "% of all pairs" + " (" + num_FN_pairs + ")"); |
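| # For each specified category, find the predicted cluster that captures the most |
| # of its rows (see get_best_assignments at the end of this script) |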
| |
| [spY_cids, prY_cids, full_counts, matching_counts, rounded_percentages] = |
| get_best_assignments (spYprY_row_counts); |
| |
| print (" "); |
| print ("Specified Categories versus Predicted Clusters:"); |
| |
| spY_cids = spY_cids + orig_min_spY - 1; |
| prY_cids = prY_cids + orig_min_prY - 1; |
| |
| for (i in 1 : nrow (spY_cids)) |
| { |
| cid = as.integer (as.scalar (spY_cids [i, 1])); |
| pct = as.scalar (rounded_percentages [i, 1]); |
| space_pct = ""; if (pct < 100) {space_pct = " ";} if (pct < 10) {space_pct = " ";} |
| print ("Category " + cid + |
| ": best pred. cluster is " + as.integer (as.scalar (prY_cids [i, 1])) + |
| "; full count = " + as.integer (as.scalar (full_counts [i, 1])) + |
| ", matching count = " + space_pct + pct + "% (" + |
| as.integer (as.scalar (matching_counts [i, 1])) + ")"); |
| |
| str = append (str, "SPEC_TO_PRED," + cid + "," + as.scalar (prY_cids [i, 1])); |
| str = append (str, "SPEC_FULL_CT," + cid + "," + as.scalar (full_counts [i, 1])); |
| str = append (str, "SPEC_MATCH_CT," + cid + "," + as.scalar (matching_counts [i, 1])); |
| str = append (str, "SPEC_MATCH_PC," + cid + "," + as.scalar (rounded_percentages [i, 1])); |
| } |
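| # Repeat in the opposite direction: for each predicted cluster, find the specified |
| # category that it matches best |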
| |
| [prY_cids, spY_cids, full_counts, matching_counts, rounded_percentages] = |
| get_best_assignments (t(spYprY_row_counts)); |
| |
| print (" "); |
| print ("Predicted Clusters versus Specified Categories:"); |
| |
| prY_cids = prY_cids + orig_min_prY - 1; |
| spY_cids = spY_cids + orig_min_spY - 1; |
| |
| for (i in 1 : nrow (prY_cids)) |
| { |
| cid = as.integer (as.scalar (prY_cids [i, 1])); |
| pct = as.scalar (rounded_percentages [i, 1]); |
| space_pct = ""; if (pct < 100) {space_pct = " ";} if (pct < 10) {space_pct = " ";} |
| print ("Cluster " + cid + |
| ": best spec. categ. is " + as.integer (as.scalar (spY_cids [i, 1])) + |
| "; full count = " + as.integer (as.scalar (full_counts [i, 1])) + |
| ", matching count = " + space_pct + pct + "% (" + |
| as.integer (as.scalar (matching_counts [i, 1])) + ")"); |
| |
| str = append (str, "PRED_TO_SPEC," + cid + "," + as.scalar (spY_cids [i, 1])); |
| str = append (str, "PRED_FULL_CT," + cid + "," + as.scalar (full_counts [i, 1])); |
| str = append (str, "PRED_MATCH_CT," + cid + "," + as.scalar (matching_counts [i, 1])); |
| str = append (str, "PRED_MATCH_PC," + cid + "," + as.scalar (rounded_percentages [i, 1])); |
| } |
| |
| print (" "); |
| } # end else: spY and prY sizes match |
| } # end if (fileSpY != " ") |
| } # end else: valid combination of X and C inputs |
| |
| if ((fileO != " ") & (! is_str_empty)) { |
| write (str, fileO); |
| } |
| |
| print ("DONE: K-MEANS SCORING SCRIPT"); |
| |
| |
| |
| # Given a contingency table of counts, finds for every nonempty row the column with |
| # the largest count: row_ids are the original row indices, col_ids the matching |
| # column indices (first maximum in case of ties), margins the total count of each |
| # selected row, max_counts the matching counts, and rounded_percentages the matching |
| # counts as a percentage of the row totals, rounded to 4 decimal digits. |
| get_best_assignments = function (Matrix[double] counts) |
| return (Matrix[double] row_ids, Matrix[double] col_ids, Matrix[double] margins, |
| Matrix[double] max_counts, Matrix[double] rounded_percentages) |
| { |
| margins = rowSums (counts); |
| # Selector matrix that keeps only the rows with a nonzero margin |
| select_positive = removeEmpty (target = diag (margins > 0), margin = "rows"); |
| row_ids = select_positive %*% seq (1, nrow (margins), 1); |
| pos_counts = select_positive %*% counts; |
| # Restrict the margins to the selected rows as well, so that all outputs stay |
| # aligned row-by-row even if some intermediate labels never occur |
| margins = select_positive %*% margins; |
| max_counts = rowMaxs (pos_counts); |
| # Column index of the first maximum in each row: count the zero entries before it |
| is_max_count = (pos_counts == max_counts); |
| aggr_is_max_count = t(cumsum (t(is_max_count))); |
| col_ids = rowSums (aggr_is_max_count == 0) + 1; |
| rounded_percentages = round (1000000.0 * max_counts / margins) / 10000.0; |
| } |
| |