#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
#
# Compares two categorical data vectors (presumed to be clusterings) by
# counting the pairs of rows on which they agree or disagree (same-cluster
# vs. different-cluster); when the data matrix X (and centroids C) are
# provided, it also reports within-cluster and between-cluster sums of squares
#
# INPUT PARAMETERS:
# ---------------------------------------------------------------------------
# NAME TYPE DEFAULT MEANING
# ---------------------------------------------------------------------------
# spY String " " Location to read a column-vector with the "specified"
# assignment of records (rows) to categories (clusters)
# prY String " " Location to read (or write, if X and C are present) a
# column-vector with the "predicted" assignment of rows
# to clusters. NOTE: The same category may be labeled
# differently in each of the two vectors, spY and prY.
# fmt String "text" Matrix output format for prY, usually "text" or "csv"
# X String " " Location to read matrix X with the input data records
# C String " " Location to read matrix C with the cluster centroids
# NOTE: If X and C are present, prY is an output file.
# O String " " Location to write the printed output statistics
# ---------------------------------------------------------------------------
#
# The "O" file provides the output statistics in CSV format, one per line, in
# the following format: NAME, [CID], VALUE. Note:
# - The 1st group statistics are given if X input is available;
# - The 2nd group statistics are given if X and C inputs are available;
# - The 3rd and 4th group statistics are given if spY input is available;
# - Only the 4th group statistics contain a nonempty CID value;
# - When present, CID contains either the specified category label or the
# predicted cluster label.
#
# NAME CID MEANING
# ---------------------------------------------------------------------------
# TSS Total Sum of Squares (from the total mean)
# WCSS_M Within-Cluster Sum of Squares (means as centers)
# WCSS_M_PC Within-Cluster Sum of Squares (means), in % of TSS
# BCSS_M Between-Cluster Sum of Squares (means as centers)
# BCSS_M_PC Between-Cluster Sum of Squares (means), in % of TSS
#
# WCSS_C Within-Cluster Sum of Squares (centroids as centers)
# WCSS_C_PC Within-Cluster Sum of Squares (centroids), % of TSS
# BCSS_C Between-Cluster Sum of Squares (centroids as centers)
# BCSS_C_PC Between-Cluster Sum of Squares (centroids), % of TSS
#
# TRUE_SAME_CT Same-category pairs predicted as Same-cluster, count
# TRUE_SAME_PC Same-category pairs predicted as Same-cluster, %
# TRUE_DIFF_CT Diff-category pairs predicted as Diff-cluster, count
# TRUE_DIFF_PC Diff-category pairs predicted as Diff-cluster, %
# FALSE_SAME_CT Diff-category pairs predicted as Same-cluster, count
# FALSE_SAME_PC Diff-category pairs predicted as Same-cluster, %
# FALSE_DIFF_CT Same-category pairs predicted as Diff-cluster, count
# FALSE_DIFF_PC Same-category pairs predicted as Diff-cluster, %
#
# SPEC_TO_PRED + For specified category, the best predicted cluster id
# SPEC_FULL_CT + For specified category, its full count
# SPEC_MATCH_CT + For specified category, best-cluster matching count
# SPEC_MATCH_PC + For specified category, % of matching to full count
# PRED_TO_SPEC + For predicted cluster, the best specified category id
# PRED_FULL_CT + For predicted cluster, its full count
# PRED_MATCH_CT + For predicted cluster, best-category matching count
# PRED_MATCH_PC + For predicted cluster, % of matching to full count
# ---------------------------------------------------------------------------
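#
# For illustration only (the values below are hypothetical), a few lines of
# the "O" file could look as follows:
#
#   TSS,,2567.33
#   WCSS_M,,1143.21
#   WCSS_M_PC,,44.5296
#   TRUE_SAME_CT,,10894
#   SPEC_TO_PRED,2,3
#   SPEC_MATCH_PC,2,97.1587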
#
# Examples:
# 1. To predict Y given X and C:
# hadoop jar SystemDS.jar -f Kmeans-predict.dml -nvargs X=INPUT_DIR/X
# C=INPUT_DIR/C prY=OUTPUT_DIR/PredY O=OUTPUT_DIR/stats
# 2. To compare "actual" labels spY with "predicted" labels given X and C:
# hadoop jar SystemDS.jar -f Kmeans-predict.dml -nvargs X=INPUT_DIR/X
# C=INPUT_DIR/C spY=INPUT_DIR/Y O=OUTPUT_DIR/stats
# 3. To compare "actual" labels spY with given "predicted" labels prY:
# hadoop jar SystemDS.jar -f Kmeans-predict.dml -nvargs spY=INPUT_DIR/Y
# prY=INPUT_DIR/PredY O=OUTPUT_DIR/stats
fmt_prY = ifdef ($fmt, "text");
filePrY = ifdef ($prY, " ");
fileSpY = ifdef ($spY, " ");
fileX = ifdef ($X, " ");
fileC = ifdef ($C, " ");
fileO = ifdef ($O, " ");
is_str_empty = TRUE;
str = " ";
print ("BEGIN K-MEANS SCORING SCRIPT");
if (fileX != " ") {
print ("Reading X...");
X = read (fileX);
total_mean = colSums (X) / nrow (X);
total_ss = sum( (X - total_mean)^2 );
}
if ((fileC != " ") & (fileX == " ")) {
print ("ERROR: Cannot provide C without providing X.");
} else {
if (fileC != " ") {
print ("Reading C...");
C = read (fileC);
num_clusters = nrow (C);
print ("Computing the predicted Y...");
D = -2 * (X %*% t(C)) + t(rowSums (C ^ 2));
prY = rowIndexMin (D);
if (filePrY != " ") {
print ("Writing the predicted Y...");
write (prY, filePrY, format=fmt_prY);
}
} else {
print ("Reading the predicted Y...");
prY = read (filePrY);
num_clusters = max (prY);
}
if (fileX != " ") {
print ("Computing the WCSS...");
# Build the 0/1 assignment matrix P (records x clusters): P[i, c] = 1
# iff record i is assigned to cluster c
P = table (seq (1, nrow (X), 1), prY, nrow(X), num_clusters);
# Compute the means, as opposed to the centroids
cluster_sizes = t(colSums (P));
M = (t(P) %*% X) / (cluster_sizes + (cluster_sizes == 0));
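# (cluster_sizes == 0) bumps the denominator of empty clusters to 1,
# avoiding division by zero; the corresponding rows of M remain all zero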
# Compute the WCSS for the means
wcss_means = sum ((X - P %*% M) ^ 2);
wcss_means_pc = 100.0 * wcss_means / total_ss;
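# For cluster means, the standard decomposition TSS = WCSS_M + BCSS_M holds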
bcss_means = sum (cluster_sizes * rowSums ((M - total_mean) ^ 2));
bcss_means_pc = 100.0 * bcss_means / total_ss;
# Output results
print ("Total Sum of Squares (TSS) = " + total_ss);
print ("WCSS for cluster means: " + (round (10000.0 * wcss_means_pc) / 10000.0) + "% of TSS = " + wcss_means);
print ("BCSS for cluster means: " + (round (10000.0 * bcss_means_pc) / 10000.0) + "% of TSS = " + bcss_means);
str = "TSS,," + total_ss;
str = append (str, "WCSS_M,," + wcss_means);
str = append (str, "WCSS_M_PC,," + wcss_means_pc);
str = append (str, "BCSS_M,," + bcss_means);
str = append (str, "BCSS_M_PC,," + bcss_means_pc);
is_str_empty = FALSE;
}
if (fileC != " ") {
# Compute the WCSS for the centroids
wcss_centroids = sum ((X - P %*% C) ^ 2);
wcss_centroids_pc = 100.0 * wcss_centroids / total_ss;
bcss_centroids = sum (cluster_sizes * rowSums ((C - total_mean) ^ 2));
bcss_centroids_pc = 100.0 * bcss_centroids / total_ss;
# Output results
print ("WCSS for centroids: " + (round (10000.0 * wcss_centroids_pc) / 10000.0) + "% of TSS = " + wcss_centroids);
print ("BCSS for centroids: " + (round (10000.0 * bcss_centroids_pc) / 10000.0) + "% of TSS = " + bcss_centroids);
str = append (str, "WCSS_C,," + wcss_centroids);
str = append (str, "WCSS_C_PC,," + wcss_centroids_pc);
str = append (str, "BCSS_C,," + bcss_centroids);
str = append (str, "BCSS_C_PC,," + bcss_centroids_pc);
}
if (fileSpY != " ") {
print ("Reading the specified Y...");
spY = read (fileSpY);
num_records = nrow (spY);
if (num_records != nrow (prY) | ncol (spY) != 1 | ncol (prY) != 1) {
print ("ERROR: spY and/or prY size mismatch");
print ("nrow (spY) = " + nrow (spY) + "; ncol (spY) = " + ncol (spY)
+ "; nrow (prY) = " + nrow (prY) + "; ncol (prY) = " + ncol (prY));
} else {
print ("Computing the pairs counts...");
orig_min_spY = min (spY);
orig_min_prY = min (prY);
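# Shift both label vectors so that the smallest label becomes 1, as needed
# for building the contingency table with table(); the original offsets are
# restored further below, before the labels are printed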
spY = spY + (1 - orig_min_spY);
prY = prY + (1 - orig_min_prY);
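# Contingency table: cell (i, j) counts the records with (shifted) specified
# category i and predicted cluster j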
spYprY_row_counts = table (spY, prY);
spY_row_counts = rowSums (spYprY_row_counts);
prY_row_counts = t(colSums (spYprY_row_counts));
# Count all pairs of rows having the same (spY, prY)-values
spYprY_pair_counts = spYprY_row_counts * (spYprY_row_counts - 1) / 2;
# Count all pairs of rows having the same spY-values
spY_pair_counts = spY_row_counts * (spY_row_counts - 1) / 2;
# Count all pairs of rows having the same prY-values
prY_pair_counts = prY_row_counts * (prY_row_counts - 1) / 2;
num_pairs = num_records * (num_records - 1.0) / 2.0;
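# Rand-index style pair counting: a pair of records is a true positive if it
# shares both the specified category and the predicted cluster, a false
# positive if it shares only the cluster, a false negative if it shares only
# the category, and a true negative otherwise; the four counts sum to num_pairs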
num_TP_pairs = sum (spYprY_pair_counts);
num_FP_pairs = sum (prY_pair_counts) - num_TP_pairs;
num_FN_pairs = sum (spY_pair_counts) - num_TP_pairs;
num_TN_pairs = num_pairs - num_TP_pairs - num_FP_pairs - num_FN_pairs;
pct_TP_pairs = num_TP_pairs / num_pairs * 100.0;
pct_TN_pairs = num_TN_pairs / num_pairs * 100.0;
pct_FP_pairs = num_FP_pairs / num_pairs * 100.0;
pct_FN_pairs = num_FN_pairs / num_pairs * 100.0;
if (is_str_empty) {
str = "TRUE_SAME_CT,," + num_TP_pairs;
is_str_empty = FALSE;
} else {
str = append (str, "TRUE_SAME_CT,," + num_TP_pairs);
}
str = append (str, "TRUE_SAME_PC,," + pct_TP_pairs);
str = append (str, "TRUE_DIFF_CT,," + num_TN_pairs);
str = append (str, "TRUE_DIFF_PC,," + pct_TN_pairs);
str = append (str, "FALSE_SAME_CT,," + num_FP_pairs);
str = append (str, "FALSE_SAME_PC,," + pct_FP_pairs);
str = append (str, "FALSE_DIFF_CT,," + num_FN_pairs);
str = append (str, "FALSE_DIFF_PC,," + pct_FN_pairs);
pct_TP_pairs = round (pct_TP_pairs * 10000.0) / 10000.0;
pct_TN_pairs = round (pct_TN_pairs * 10000.0) / 10000.0;
pct_FP_pairs = round (pct_FP_pairs * 10000.0) / 10000.0;
pct_FN_pairs = round (pct_FN_pairs * 10000.0) / 10000.0;
space_TP = ""; if (pct_TP_pairs < 100) {space_TP = " ";} if (pct_TP_pairs < 10) {space_TP = " ";}
space_TN = ""; if (pct_TN_pairs < 100) {space_TN = " ";} if (pct_TN_pairs < 10) {space_TN = " ";}
space_FP = ""; if (pct_FP_pairs < 100) {space_FP = " ";} if (pct_FP_pairs < 10) {space_FP = " ";}
space_FN = ""; if (pct_FN_pairs < 100) {space_FN = " ";} if (pct_FN_pairs < 10) {space_FN = " ";}
print ("Same-cluster pairs predicted as Same-cluster ( True Pos): " + space_TP
+ pct_TP_pairs + "% of all pairs" + " (" + num_TP_pairs + ")");
print ("Diff-cluster pairs predicted as Diff-cluster ( True Neg): " + space_TN
+ pct_TN_pairs + "% of all pairs" + " (" + num_TN_pairs + ")");
print ("Diff-cluster pairs predicted as Same-cluster (False Pos): " + space_FP
+ pct_FP_pairs + "% of all pairs" + " (" + num_FP_pairs + ")");
print ("Same-cluster pairs predicted as Diff-cluster (False Neg): " + space_FN
+ pct_FN_pairs + "% of all pairs" + " (" + num_FN_pairs + ")");
[spY_cids, prY_cids, full_counts, matching_counts, rounded_percentages] =
get_best_assignments (spYprY_row_counts);
print (" ");
print ("Specified Categories versus Predicted Clusters:");
spY_cids = spY_cids + orig_min_spY - 1;
prY_cids = prY_cids + orig_min_prY - 1;
for (i in 1 : nrow (spY_cids))
{
cid = as.integer (as.scalar (spY_cids [i, 1]));
pct = as.scalar (rounded_percentages [i, 1]);
space_pct = ""; if (pct < 100) {space_pct = " ";} if (pct < 10) {space_pct = " ";}
print ("Category " + cid +
": best pred. cluster is " + as.integer (as.scalar (prY_cids [i, 1])) +
"; full count = " + as.integer (as.scalar (full_counts [i, 1])) +
", matching count = " + space_pct + pct + "% (" +
as.integer (as.scalar (matching_counts [i, 1])) + ")");
str = append (str, "SPEC_TO_PRED," + cid + "," + as.scalar (prY_cids [i, 1]));
str = append (str, "SPEC_FULL_CT," + cid + "," + as.scalar (full_counts [i, 1]));
str = append (str, "SPEC_MATCH_CT," + cid + "," + as.scalar (matching_counts [i, 1]));
str = append (str, "SPEC_MATCH_PC," + cid + "," + as.scalar (rounded_percentages [i, 1]));
}
[prY_cids, spY_cids, full_counts, matching_counts, rounded_percentages] =
get_best_assignments (t(spYprY_row_counts));
print (" ");
print ("Predicted Clusters versus Specified Categories:");
prY_cids = prY_cids + orig_min_prY - 1;
spY_cids = spY_cids + orig_min_spY - 1;
for (i in 1 : nrow (prY_cids))
{
cid = as.integer (as.scalar (prY_cids [i, 1]));
pct = as.scalar (rounded_percentages [i, 1]);
space_pct = ""; if (pct < 100) {space_pct = " ";} if (pct < 10) {space_pct = " ";}
print ("Cluster " + cid +
": best spec. categ. is " + as.integer (as.scalar (spY_cids [i, 1])) +
"; full count = " + as.integer (as.scalar (full_counts [i, 1])) +
", matching count = " + space_pct + pct + "% (" +
as.integer (as.scalar (matching_counts [i, 1])) + ")");
str = append (str, "PRED_TO_SPEC," + cid + "," + as.scalar (spY_cids [i, 1]));
str = append (str, "PRED_FULL_CT," + cid + "," + as.scalar (full_counts [i, 1]));
str = append (str, "PRED_MATCH_CT," + cid + "," + as.scalar (matching_counts [i, 1]));
str = append (str, "PRED_MATCH_PC," + cid + "," + as.scalar (rounded_percentages [i, 1]));
}
print (" ");
}}}
if ((fileO != " ") & (! is_str_empty)) {
write (str, fileO);
}
print ("DONE: K-MEANS SCORING SCRIPT");
# Given a matrix of counts, return the vector of row margins (row sums) and,
# for every row with a nonzero margin: its row id, the column id of its
# (first) largest count, that largest count, and the largest count as a
# percentage of the row margin (rounded to 4 decimal places)
get_best_assignments = function (Matrix[double] counts)
return (Matrix[double] row_ids, Matrix[double] col_ids, Matrix[double] margins,
Matrix[double] max_counts, Matrix[double] rounded_percentages)
{
margins = rowSums (counts);
# Selector matrix that keeps only the rows with a positive margin
select_positive = removeEmpty (target = diag (margins > 0), margin = "rows");
row_ids = select_positive %*% seq (1, nrow (margins), 1);
pos_counts = select_positive %*% counts;
pos_margins = select_positive %*% margins;
max_counts = rowMaxs (pos_counts);
# Locate the first maximum in each row: cells to its left have a zero
# cumulative is-max count, so counting those zeros gives its column index - 1
is_max_count = (pos_counts == max_counts);
aggr_is_max_count = t(cumsum (t(is_max_count)));
col_ids = rowSums (aggr_is_max_count == 0) + 1;
rounded_percentages = round (1000000.0 * max_counts / pos_margins) / 10000.0;
}