blob: 9b3c90c5458b7a5a61aa34efa7d371b744012d90 [file] [log] [blame]
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
#
# Generates random Gaussian-mixture data to test k-Means clustering algorithms
#
# INPUT PARAMETERS:
# ----------------------------------------------------------------------------
# NAME TYPE DEFAULT MEANING
# ----------------------------------------------------------------------------
# nr Int --- Number of records
# nf Int --- Number of features
# nc Int --- Number of clusters
# dc Double --- St.dev. of cluster "centroid" features from zero mean
# dr Double --- St.dev. of the 1-st feature in a record within cluster
# fbf Double --- Feature bias factor: Stdev(last) / Stdev(1-st) feature
# cbf Double --- Cluster bias factor: Prob[1-st clus] / Prob[k-th clus]
# X String "X" Location to write matrix X with generated data records
# C String "C" Location to write cluster "centroids" (Gaussian means)
# Y String "Y" Location to write assignment of records to cluster ids
# YbyC String "YbyC" Location to write rec-cluster assigns by min-dist to C
# fmt String "text" Matrix output format used when writing X, C, Y and YbyC
# ----------------------------------------------------------------------------
#
# Example:
# hadoop jar SystemML.jar -f genRandData4Kmeans.dml -nvargs nr=100000 nf=100
# nc=10 dc=10.0 dr=1.0 fbf=100.0 cbf=100.0 X=X.mtx C=C.mtx Y=Y.mtx YbyC=YbyC.mtx
print ("BEGIN K-MEANS GENERATOR SCRIPT");

# Optional arguments: each falls back to the given default when the
# corresponding -nvargs entry is not supplied.
fmt      = ifdef ($fmt,  "text");
fileX    = ifdef ($X,    "X");
fileC    = ifdef ($C,    "C");
fileY    = ifdef ($Y,    "Y");
fileYbyC = ifdef ($YbyC, "YbyC");

# Required arguments: problem dimensions.
num_records   = $nr;
num_features  = $nf;
num_centroids = $nc;

# Required arguments: spread of the centroids and of records around them.
dist_per_feature_centroids    = $dc;
dist_per_feature_first_record = $dr;

# Required arguments: skew across features and across cluster sizes.
feature_bias_factor = $fbf;
cluster_bias_factor = $cbf;
print ("Generating cluster distribution (mixture) centroids...");
# One row per centroid, one column per feature: normal draws scaled by the
# requested per-feature standard deviation of the centroids.
C = dist_per_feature_centroids * Rand (rows = num_centroids, cols = num_features, pdf = "normal");
print ("Generating record-to-cluster assignments...");
# Y is a multinomial in {1, ..., num_centroids} with 1 being more likely
# than "num_centroids" by the factor of "cluster_bias_factor"
rnd = Rand (rows = num_records, cols = 1, min = 0.0, max = 1.0, pdf = "uniform");
if ((cluster_bias_factor == 1.0) | (num_centroids == 1)) {
    # No bias requested, or only one cluster to choose from: spread the
    # uniform draws evenly over the cluster ids.  The single-cluster case
    # is routed here because the biased formula below divides by
    # (num_centroids - 1), which would be zero.
    Y = round (0.5 + rnd * num_centroids);
} else {
    # Rescale the uniform draws and push them through a logarithm so that
    # cluster ids follow a geometric-like distribution whose first-to-last
    # probability ratio is governed by cluster_bias_factor.
    rnd_scaled = rnd * (1 - cluster_bias_factor ^ (- num_centroids / (num_centroids - 1)));
    Y = round (0.5 - (num_centroids - 1) * log (1 - rnd_scaled) / log (cluster_bias_factor));
}
# Keep every id inside {1, ..., num_centroids}: a draw at the very edge of
# the unit interval, or floating-point error in the logarithms, could
# otherwise round to an id just outside the valid range.
Y = min (max (Y, 1), num_centroids);
print ("Generating within-cluster random shifts...");
X_shift = Rand (rows = num_records, cols = num_features, pdf = "normal");
# Per-feature standard deviations grow geometrically from
# dist_per_feature_first_record (first feature) to
# dist_per_feature_first_record * feature_bias_factor (last feature).
# With a single feature the exponent's denominator (num_features - 1) would
# be zero and yield 0/0 = NaN, so the denominator is floored at 1; the lone
# feature then gets exactly dist_per_feature_first_record.
feature_span = max (num_features - 1, 1);
feature_factors = dist_per_feature_first_record *
    exp ((seq (1, num_features) - 1) / feature_span * log (feature_bias_factor));
# Scale column j of the shifts by feature_factors[j].
X_shift = X_shift %*% diag (feature_factors);
print ("Generating records by shifting from centroids...");
# Indicator matrix of the assignments: row i has a 1 in column Y[i].
# table() only produces as many columns as the largest id that actually
# occurs, so the result is copied into a zero matrix of full width.
membership_seen = table (seq (1, num_records), Y);
membership = matrix (0, rows = num_records, cols = num_centroids);
membership [, 1 : ncol (membership_seen)] = membership_seen;
# Each record is its within-cluster shift added to its cluster's centroid.
X = X_shift + membership %*% C;
print ("Computing record-to-cluster assignments by minimum centroid distance...");
# D[i, j] = -2 * <x_i, c_j> + ||c_j||^2, i.e. the squared distance from
# record i to centroid j without the ||x_i||^2 term, which is constant
# within a row and therefore does not affect which centroid is closest.
centroid_sq_norms = rowSums (C ^ 2);
cross_terms = -2 * (X %*% t(C));
D = t(t(cross_terms) + centroid_sq_norms);
# Flag every centroid that attains the row minimum (ties flag several).
closest_dist = rowMins (D);
is_closest = (D <= closest_dist);
# Running count of flags along each row; the transposes are there so that
# cumsum accumulates across centroids rather than across records.
flags_so_far = t(cumsum (t(is_closest)));
# Columns still at zero precede the first flagged centroid, so their count
# plus one is the id of the first closest centroid.
Y_by_C = 1 + rowSums (flags_so_far == 0);
print ("Computing useful statistics...");
# Sum of squared entries of X, shared by both WCSS figures below.
total_sq = sum (X ^ 2);
# WCSS around the overall mean: sum ||x||^2 - ||sum x||^2 / n.
wcss_single = total_sq - sum (colSums (X) ^ 2) / num_records;
# D lacks the ||x||^2 term, so adding total_sq to the per-record minima of D
# gives the summed squared distance to the nearest generating centroid.
wcss_mixture = total_sq + sum (rowMins (D));
print ("Default (single-cluster) WCSS = " + wcss_single);
print (num_centroids + "-cluster WCSS attained by the mixture centroids = " + wcss_mixture);
# Persist the generated dataset; all four matrices use the same output format.
print ("Writing out the resulting dataset...");
# Generated records.
write (X, fileX, format = fmt);
# Cluster centroids the records were generated around.
write (C, fileC, format = fmt);
# Cluster id each record was generated from.
write (Y, fileY, format = fmt);
# Cluster id of the centroid in C closest to each record.
write (Y_by_C, fileYbyC, format = fmt);
# The two assignment files can differ; comparing them is left to a separate script.
print ("Please run the scoring script to compare " + fileY + " with " + fileYbyC);
print ("DONE: K-MEANS GENERATOR SCRIPT");