| #------------------------------------------------------------- |
| # |
| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| # |
| #------------------------------------------------------------- |
| |
| # |
| # Compares two categorical data vectors (presumed to be clusterings) by counting |
| # matching/nonmatching same-cluster/different-cluster pairs of rows. Optionally, |
| # given X and C, it also predicts the cluster assignments and reports the |
| # within-cluster and between-cluster sums of squares (see the statistics below). |
| # |
| # INPUT PARAMETERS: |
| # --------------------------------------------------------------------------- |
| # NAME TYPE DEFAULT MEANING |
| # --------------------------------------------------------------------------- |
| # spY String " " Location to read a column-vector with the "specified" |
| # assignment of records (rows) to categories (clusters) |
| # prY String " " Location to read (or write, if X and C are present) a |
| # column-vector with the "predicted" assignment of rows |
| # to clusters. NOTE: The same category may be labeled |
| # differently in each of the two vectors, spY and prY. |
| # fmt String "text" Matrix output format for prY, usually "text" or "csv" |
| # X String " " Location to read matrix X with the input data records |
| # C String " " Location to read matrix C with the cluster centroids |
| # NOTE: If X and C are present, prY is an output file. |
| # O String " " Location to write the printed output statistics |
| # --------------------------------------------------------------------------- |
| # |
| # The "O" file provides the output statistics as CSV, one statistic per line, in |
| # the format NAME,CID,VALUE, where the CID field may be empty. Note: |
| # - The 1st group of statistics is given if the X input is available; |
| # - The 2nd group of statistics is given if both the X and C inputs are available; |
| # - The 3rd and 4th groups of statistics are given if the spY input is available; |
| # - Only the 4th group of statistics has a nonempty CID value; |
| # - When present, CID contains either the specified category label or the |
| # predicted cluster label. |
| # |
| # NAME CID MEANING |
| # --------------------------------------------------------------------------- |
| # TSS Total Sum of Squares (from the total mean) |
| # WCSS_M Within-Cluster Sum of Squares (means as centers) |
| # WCSS_M_PC Within-Cluster Sum of Squares (means), in % of TSS |
| # BCSS_M Between-Cluster Sum of Squares (means as centers) |
| # BCSS_M_PC Between-Cluster Sum of Squares (means), in % of TSS |
| # |
| # WCSS_C Within-Cluster Sum of Squares (centroids as centers) |
| # WCSS_C_PC Within-Cluster Sum of Squares (centroids), % of TSS |
| # BCSS_C Between-Cluster Sum of Squares (centroids as centers) |
| # BCSS_C_PC Between-Cluster Sum of Squares (centroids), % of TSS |
| # |
| # TRUE_SAME_CT Same-category pairs predicted as Same-cluster, count |
| # TRUE_SAME_PC Same-category pairs predicted as Same-cluster, % |
| # TRUE_DIFF_CT Diff-category pairs predicted as Diff-cluster, count |
| # TRUE_DIFF_PC Diff-category pairs predicted as Diff-cluster, % |
| # FALSE_SAME_CT Diff-category pairs predicted as Same-cluster, count |
| # FALSE_SAME_PC Diff-category pairs predicted as Same-cluster, % |
| # FALSE_DIFF_CT Same-category pairs predicted as Diff-cluster, count |
| # FALSE_DIFF_PC Same-category pairs predicted as Diff-cluster, % |
| # |
| # SPEC_TO_PRED + For specified category, the best predicted cluster id |
| # SPEC_FULL_CT + For specified category, its full count |
| # SPEC_MATCH_CT + For specified category, best-cluster matching count |
| # SPEC_MATCH_PC + For specified category, % of matching to full count |
| # PRED_TO_SPEC + For predicted cluster, the best specified category id |
| # PRED_FULL_CT + For predicted cluster, its full count |
| # PRED_MATCH_CT + For predicted cluster, best-category matching count |
| # PRED_MATCH_PC + For predicted cluster, % of matching to full count |
| # --------------------------------------------------------------------------- |
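| # |
| # For illustration only, an "O" file produced by this script could contain lines |
| # such as the following (the values below are made up, not from a real run): |
| # TSS,,1024.7 |
| # WCSS_M,,213.9 |
| # WCSS_M_PC,,20.87 |
| # SPEC_TO_PRED,3,5 |
| # SPEC_MATCH_PC,3,97.25 |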
| # |
| # Examples: |
| # 1. To predict Y given X and C: |
| # hadoop jar SystemDS.jar -f Kmeans-predict.dml -nvargs X=INPUT_DIR/X |
| # C=INPUT_DIR/C prY=OUTPUT_DIR/PredY O=OUTPUT_DIR/stats |
| # 2. To compare "actual" labels spY with "predicted" labels given X and C: |
| # hadoop jar SystemDS.jar -f Kmeans-predict.dml -nvargs X=INPUT_DIR/X |
| # C=INPUT_DIR/C spY=INPUT_DIR/Y O=OUTPUT_DIR/stats |
| # 3. To compare "actual" labels spY with given "predicted" labels prY: |
| # hadoop jar SystemDS.jar -f Kmeans-predict.dml -nvargs spY=INPUT_DIR/Y |
| # prY=INPUT_DIR/PredY O=OUTPUT_DIR/stats |
| |
| |
| fmt_prY = ifdef ($fmt, "text"); |
| filePrY = ifdef ($prY, " "); |
| fileSpY = ifdef ($spY, " "); |
| fileX = ifdef ($X, " "); |
| fileC = ifdef ($C, " "); |
| fileO = ifdef ($O, " "); |
| |
| is_str_empty = TRUE; |
| str = " "; |
| |
| print ("BEGIN K-MEANS SCORING SCRIPT"); |
| |
| if (fileX != " ") { |
| print ("Reading X..."); |
| X = read (fileX); |
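| # Grand mean of the data (as a row vector) and the total sum of squared |
| # deviations from it (TSS) |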
| total_mean = colSums (X) / nrow (X); |
| total_ss = sum( (X - total_mean)^2 ); |
| } |
| |
| if ((fileC != " ") & (fileX == " ")) { |
| print ("ERROR: Cannot provide C without providing X."); |
| } else { |
| |
| |
| if (fileC != " ") { |
| print ("Reading C..."); |
| C = read (fileC); |
| num_clusters = nrow (C); |
| print ("Computing the predicted Y..."); |
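| # Nearest-centroid assignment: dist(x,c)^2 = sum(x^2) - 2*sum(x*c) + sum(c^2), and |
| # the sum(x^2) term is the same for every centroid, so it suffices to minimize |
| # -2*(X %*% t(C)) + t(rowSums(C^2)) row-wise |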
| D = -2 * (X %*% t(C)) + t(rowSums (C ^ 2)); |
| prY = rowIndexMin (D); |
| if (filePrY != " ") { |
| print ("Writing the predicted Y..."); |
| write (prY, filePrY, format=fmt_prY); |
| } |
| } else { |
| print ("Reading the predicted Y..."); |
| prY = read (filePrY); |
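| # Without centroids, infer the number of clusters from the largest predicted label |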
| num_clusters = max (prY); |
| } |
| |
| if (fileX != " ") { |
| print ("Computing the WCSS..."); |
| # Compute projection matrix from clusters to records |
| P = table (seq (1, nrow (X), 1), prY, nrow(X), num_clusters); |
| # Compute the means, as opposed to the centroids |
| cluster_sizes = t(colSums (P)); |
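| # The (cluster_sizes == 0) term avoids division by zero for empty clusters |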
| M = (t(P) %*% X) / (cluster_sizes + (cluster_sizes == 0)); |
| # Compute the WCSS for the means |
| wcss_means = sum ((X - P %*% M) ^ 2); |
| wcss_means_pc = 100.0 * wcss_means / total_ss; |
| bcss_means = sum (cluster_sizes * rowSums ((M - total_mean) ^ 2)); |
| bcss_means_pc = 100.0 * bcss_means / total_ss; |
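| # With the cluster means as centers, TSS decomposes exactly into WCSS_M + BCSS_M, |
| # so the two percentages should add up to (approximately) 100 |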
| # Output results |
| print ("Total Sum of Squares (TSS) = " + total_ss); |
| print ("WCSS for cluster means: " + (round (10000.0 * wcss_means_pc) / 10000.0) + "% of TSS = " + wcss_means); |
| print ("BCSS for cluster means: " + (round (10000.0 * bcss_means_pc) / 10000.0) + "% of TSS = " + bcss_means); |
| str = "TSS,," + total_ss; |
| str = append (str, "WCSS_M,," + wcss_means); |
| str = append (str, "WCSS_M_PC,," + wcss_means_pc); |
| str = append (str, "BCSS_M,," + bcss_means); |
| str = append (str, "BCSS_M_PC,," + bcss_means_pc); |
| is_str_empty = FALSE; |
| } |
| |
| if (fileC != " ") { |
| # Compute the WCSS for the centroids |
| wcss_centroids = sum ((X - P %*% C) ^ 2); |
| wcss_centroids_pc = 100.0 * wcss_centroids / total_ss; |
| bcss_centroids = sum (cluster_sizes * rowSums ((C - total_mean) ^ 2)); |
| bcss_centroids_pc = 100.0 * bcss_centroids / total_ss; |
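| # Unlike the means-based statistics, WCSS_C and BCSS_C need not add up to TSS, |
| # because the given centroids are generally not the exact cluster means |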
| # Output results |
| print ("WCSS for centroids: " + (round (10000.0 * wcss_centroids_pc) / 10000.0) + "% of TSS = " + wcss_centroids); |
| print ("BCSS for centroids: " + (round (10000.0 * bcss_centroids_pc) / 10000.0) + "% of TSS = " + bcss_centroids); |
| str = append (str, "WCSS_C,," + wcss_centroids); |
| str = append (str, "WCSS_C_PC,," + wcss_centroids_pc); |
| str = append (str, "BCSS_C,," + bcss_centroids); |
| str = append (str, "BCSS_C_PC,," + bcss_centroids_pc); |
| } |
| |
| |
| |
| if (fileSpY != " ") { |
| |
| print ("Reading the specified Y..."); |
| spY = read (fileSpY); |
| num_records = nrow (spY); |
| |
| if (num_records != nrow (prY) | ncol (spY) != 1 | ncol (prY) != 1) { |
| print ("ERROR: spY and/or prY size mismatch"); |
| print ("nrow (spY) = " + nrow (spY) + "; ncol (spY) = " + ncol (spY) |
| + "; nrow (prY) = " + nrow (prY) + "; ncol (prY) = " + ncol (prY)); |
| } else { |
| |
| print ("Computing the pairs counts..."); |
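| # Shift both label vectors so that the smallest label becomes 1 (table() uses the |
| # label values as indices); the original minima are added back later for reporting. |
| # table(spY, prY) then yields a contingency table whose cell (i,j) counts the rows |
| # with specified label i and predicted label j. |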
| |
| orig_min_spY = min (spY); |
| orig_min_prY = min (prY); |
| spY = spY + (1 - orig_min_spY); |
| prY = prY + (1 - orig_min_prY); |
| |
| spYprY_row_counts = table (spY, prY); |
| spY_row_counts = rowSums (spYprY_row_counts); |
| prY_row_counts = t(colSums (spYprY_row_counts)); |
| |
| # Count all pairs of rows having the same (spY, prY)-values |
| spYprY_pair_counts = spYprY_row_counts * (spYprY_row_counts - 1) / 2; |
| |
| # Count all pairs of rows having the same spY-values |
| spY_pair_counts = spY_row_counts * (spY_row_counts - 1) / 2; |
| # Count all pairs of rows having the same prY-values |
| prY_pair_counts = prY_row_counts * (prY_row_counts - 1) / 2; |
| |
| num_pairs = num_records * (num_records - 1.0) / 2.0; |
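| # Pair-level confusion counts: TP = same category & same cluster, FP = different |
| # category but same cluster, FN = same category but different cluster, TN = the rest |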
| |
| num_TP_pairs = sum (spYprY_pair_counts); |
| num_FP_pairs = sum (prY_pair_counts) - num_TP_pairs; |
| num_FN_pairs = sum (spY_pair_counts) - num_TP_pairs; |
| num_TN_pairs = num_pairs - num_TP_pairs - num_FP_pairs - num_FN_pairs; |
| |
| pct_TP_pairs = num_TP_pairs / num_pairs * 100.0; |
| pct_TN_pairs = num_TN_pairs / num_pairs * 100.0; |
| pct_FP_pairs = num_FP_pairs / num_pairs * 100.0; |
| pct_FN_pairs = num_FN_pairs / num_pairs * 100.0; |
| |
| if (is_str_empty) { |
| str = "TRUE_SAME_CT,," + num_TP_pairs; |
| is_str_empty = FALSE; |
| } else { |
| str = append (str, "TRUE_SAME_CT,," + num_TP_pairs); |
| } |
| str = append (str, "TRUE_SAME_PC,," + pct_TP_pairs); |
| str = append (str, "TRUE_DIFF_CT,," + num_TN_pairs); |
| str = append (str, "TRUE_DIFF_PC,," + pct_TN_pairs); |
| str = append (str, "FALSE_SAME_CT,," + num_FP_pairs); |
| str = append (str, "FALSE_SAME_PC,," + pct_FP_pairs); |
| str = append (str, "FALSE_DIFF_CT,," + num_FN_pairs); |
| str = append (str, "FALSE_DIFF_PC,," + pct_FN_pairs); |
| |
| pct_TP_pairs = round (pct_TP_pairs * 10000.0) / 10000.0; |
| pct_TN_pairs = round (pct_TN_pairs * 10000.0) / 10000.0; |
| pct_FP_pairs = round (pct_FP_pairs * 10000.0) / 10000.0; |
| pct_FN_pairs = round (pct_FN_pairs * 10000.0) / 10000.0; |
| |
| space_TP = ""; if (pct_TP_pairs < 100) {space_TP = " ";} if (pct_TP_pairs < 10) {space_TP = " ";} |
| space_TN = ""; if (pct_TN_pairs < 100) {space_TN = " ";} if (pct_TN_pairs < 10) {space_TN = " ";} |
| space_FP = ""; if (pct_FP_pairs < 100) {space_FP = " ";} if (pct_FP_pairs < 10) {space_FP = " ";} |
| space_FN = ""; if (pct_FN_pairs < 100) {space_FN = " ";} if (pct_FN_pairs < 10) {space_FN = " ";} |
| |
| print ("Same-category pairs predicted as Same-cluster ( True Pos): " + space_TP |
| + pct_TP_pairs + "% of all pairs" + " (" + num_TP_pairs + ")"); |
| print ("Diff-category pairs predicted as Diff-cluster ( True Neg): " + space_TN |
| + pct_TN_pairs + "% of all pairs" + " (" + num_TN_pairs + ")"); |
| print ("Diff-category pairs predicted as Same-cluster (False Pos): " + space_FP |
| + pct_FP_pairs + "% of all pairs" + " (" + num_FP_pairs + ")"); |
| print ("Same-category pairs predicted as Diff-cluster (False Neg): " + space_FN |
| + pct_FN_pairs + "% of all pairs" + " (" + num_FN_pairs + ")"); |
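| # For each specified category, find the predicted cluster that captures the most |
| # of its rows (see get_best_assignments at the end of this script) |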
| |
| [spY_cids, prY_cids, full_counts, matching_counts, rounded_percentages] = |
| get_best_assignments (spYprY_row_counts); |
| |
| print (" "); |
| print ("Specified Categories versus Predicted Clusters:"); |
| |
| spY_cids = spY_cids + orig_min_spY - 1; |
| prY_cids = prY_cids + orig_min_prY - 1; |
| |
| for (i in 1 : nrow (spY_cids)) |
| { |
| cid = as.integer (as.scalar (spY_cids [i, 1])); |
| pct = as.scalar (rounded_percentages [i, 1]); |
| space_pct = ""; if (pct < 100) {space_pct = " ";} if (pct < 10) {space_pct = " ";} |
| print ("Category " + cid + |
| ": best pred. cluster is " + as.integer (as.scalar (prY_cids [i, 1])) + |
| "; full count = " + as.integer (as.scalar (full_counts [i, 1])) + |
| ", matching count = " + space_pct + pct + "% (" + |
| as.integer (as.scalar (matching_counts [i, 1])) + ")"); |
| |
| str = append (str, "SPEC_TO_PRED," + cid + "," + as.scalar (prY_cids [i, 1])); |
| str = append (str, "SPEC_FULL_CT," + cid + "," + as.scalar (full_counts [i, 1])); |
| str = append (str, "SPEC_MATCH_CT," + cid + "," + as.scalar (matching_counts [i, 1])); |
| str = append (str, "SPEC_MATCH_PC," + cid + "," + as.scalar (rounded_percentages [i, 1])); |
| } |
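| # Repeat in the opposite direction: for each predicted cluster, find the specified |
| # category that it matches best |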
| |
| [prY_cids, spY_cids, full_counts, matching_counts, rounded_percentages] = |
| get_best_assignments (t(spYprY_row_counts)); |
| |
| print (" "); |
| print ("Predicted Clusters versus Specified Categories:"); |
| |
| prY_cids = prY_cids + orig_min_prY - 1; |
| spY_cids = spY_cids + orig_min_spY - 1; |
| |
| for (i in 1 : nrow (prY_cids)) |
| { |
| cid = as.integer (as.scalar (prY_cids [i, 1])); |
| pct = as.scalar (rounded_percentages [i, 1]); |
| space_pct = ""; if (pct < 100) {space_pct = " ";} if (pct < 10) {space_pct = " ";} |
| print ("Cluster " + cid + |
| ": best spec. categ. is " + as.integer (as.scalar (spY_cids [i, 1])) + |
| "; full count = " + as.integer (as.scalar (full_counts [i, 1])) + |
| ", matching count = " + space_pct + pct + "% (" + |
| as.integer (as.scalar (matching_counts [i, 1])) + ")"); |
| |
| str = append (str, "PRED_TO_SPEC," + cid + "," + as.scalar (spY_cids [i, 1])); |
| str = append (str, "PRED_FULL_CT," + cid + "," + as.scalar (full_counts [i, 1])); |
| str = append (str, "PRED_MATCH_CT," + cid + "," + as.scalar (matching_counts [i, 1])); |
| str = append (str, "PRED_MATCH_PC," + cid + "," + as.scalar (rounded_percentages [i, 1])); |
| } |
| |
| print (" "); |
| } # end else: spY and prY sizes match |
| } # end if (fileSpY != " ") |
| } # end else: valid combination of X and C inputs |
| |
| if ((fileO != " ") & (! is_str_empty)) { |
| write (str, fileO); |
| } |
| |
| print ("DONE: K-MEANS SCORING SCRIPT"); |
| |
| |
| |
| # Given a contingency table of counts, finds for every nonempty row the column with |
| # the largest count: row_ids are the original row indices, col_ids the matching |
| # column indices (first maximum in case of ties), margins the total count of each |
| # selected row, max_counts the matching counts, and rounded_percentages the matching |
| # counts as a percentage of the row totals, rounded to 4 decimal digits. |
| get_best_assignments = function (Matrix[double] counts) |
| return (Matrix[double] row_ids, Matrix[double] col_ids, Matrix[double] margins, |
| Matrix[double] max_counts, Matrix[double] rounded_percentages) |
| { |
| margins = rowSums (counts); |
| # Selector matrix that keeps only the rows with a nonzero margin |
| select_positive = removeEmpty (target = diag (margins > 0), margin = "rows"); |
| row_ids = select_positive %*% seq (1, nrow (margins), 1); |
| pos_counts = select_positive %*% counts; |
| # Restrict the margins to the selected rows as well, so that all outputs stay |
| # aligned row-by-row even if some intermediate labels never occur |
| margins = select_positive %*% margins; |
| max_counts = rowMaxs (pos_counts); |
| # Column index of the first maximum in each row: count the zero entries before it |
| is_max_count = (pos_counts == max_counts); |
| aggr_is_max_count = t(cumsum (t(is_max_count))); |
| col_ids = rowSums (aggr_is_max_count == 0) + 1; |
| rounded_percentages = round (1000000.0 * max_counts / margins) / 10000.0; |
| } |
| |