| #------------------------------------------------------------- |
| # |
| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| # |
| #------------------------------------------------------------- |
| |
| # |
| # Generates random Gaussian-mixture data to test k-Means clustering algorithms |
| # |
| # INPUT PARAMETERS: |
| # ---------------------------------------------------------------------------- |
| # NAME TYPE DEFAULT MEANING |
| # ---------------------------------------------------------------------------- |
| # nr Int --- Number of records |
| # nf Int --- Number of features |
| # nc Int --- Number of clusters |
| # dc Double --- St.dev. of cluster "centroid" features from zero mean |
| # dr Double --- St.dev. of the 1-st feature in a record within cluster |
| # fbf Double --- Feature bias factor: Stdev(last) / Stdev(1-st) feature |
| # cbf Double --- Cluster bias factor: Prob[1-st clus] / Prob[last (nc-th) clus] |
| # X String "X" Location to write matrix X with generated data records |
| # C String "C" Location to write cluster "centroids" (Gaussian means) |
| # Y String "Y" Location to write assignment of records to cluster ids |
| # YbyC String "YbyC" Location to write rec-cluster assigns by min-dist to C |
| # fmt String "text" File format used when writing X, C, Y and YbyC |
| # ---------------------------------------------------------------------------- |
| # |
| # Example: |
| # hadoop jar SystemDS.jar -f genRandData4Kmeans.dml -nvargs nr=100000 nf=100 |
| # nc=10 dc=10.0 dr=1.0 fbf=100.0 cbf=100.0 X=X.mtx C=C.mtx Y=Y.mtx YbyC=YbyC.mtx |
| |
| # Announce start-up and bind the command-line arguments to script variables. |
| print ("BEGIN K-MEANS GENERATOR SCRIPT"); |
| |
| # Output locations and file format: ifdef() supplies the fallback value when |
| # the corresponding argument is not passed on the command line. |
| fmt = ifdef ($fmt, "text"); |
| fileYbyC = ifdef ($YbyC, "YbyC"); |
| fileY = ifdef ($Y, "Y"); |
| fileC = ifdef ($C, "C"); |
| fileX = ifdef ($X, "X"); |
| |
| # Mixture-level settings (no fallback: these arguments must be supplied). |
| num_centroids = $nc; |
| cluster_bias_factor = $cbf; |
| dist_per_feature_centroids = $dc; |
| |
| # Record-level settings (no fallback: these arguments must be supplied). |
| num_records = $nr; |
| num_features = $nf; |
| dist_per_feature_first_record = $dr; |
| feature_bias_factor = $fbf; |
| |
| print ("Generating cluster distribution (mixture) centroids..."); |
| |
| # One row per centroid: standard-normal draws scaled so that every centroid |
| # coordinate has standard deviation dist_per_feature_centroids around zero. |
| C = dist_per_feature_centroids * Rand (rows = num_centroids, cols = num_features, pdf = "normal"); |
| |
| print ("Generating record-to-cluster assignments..."); |
| |
| # Y is a multinomial in {1, ..., num_centroids} with 1 being more likely |
| # than "num_centroids" by the factor of "cluster_bias_factor": |
| #   Prob[Y = k] is proportional to cluster_bias_factor ^ (- (k - 1) / (num_centroids - 1)). |
| # Y is drawn by inverting the CDF of that distribution at a uniform sample; |
| # round (0.5 + y) is used as the ceiling of the continuous inverse y. |
| |
| rnd = Rand (rows = num_records, cols = 1, min = 0.0, max = 1.0, pdf = "uniform"); |
| if (cluster_bias_factor == 1.0 | num_centroids == 1) { |
|     # Equal cluster probabilities. A single cluster also takes this path, because |
|     # the biased formula below divides by (num_centroids - 1) = 0 in that case. |
|     Y = round (0.5 + rnd * num_centroids); |
| } else { |
|     # Rescale the uniform sample to [0, 1 - q ^ num_centroids) with |
|     # q = cluster_bias_factor ^ (- 1 / (num_centroids - 1)), then invert the geometric CDF. |
|     rnd_scaled = rnd * (1 - cluster_bias_factor ^ (- num_centroids / (num_centroids - 1))); |
|     Y = round (0.5 - (num_centroids - 1) * log (1 - rnd_scaled) / log (cluster_bias_factor)); |
| } |
| |
| # Floating-point rounding at the ends of the CDF could otherwise produce an id |
| # outside {1, ..., num_centroids}; clamp so every label refers to a row of C. |
| Y = min (max (Y, 1), num_centroids); |
| |
| print ("Generating within-cluster random shifts..."); |
| |
| # Standard-normal noise, one row per record and one column per feature. |
| X_shift = Rand (rows = num_records, cols = num_features, pdf = "normal"); |
| |
| # Per-feature standard deviation: a geometric progression that starts at |
| # dist_per_feature_first_record for feature 1 and reaches |
| # dist_per_feature_first_record * feature_bias_factor for the last feature. |
| # The denominator is max (num_features - 1, 1) so that a single-feature run |
| # evaluates 0 / 1 = 0 (factor = dist_per_feature_first_record) instead of 0 / 0 = NaN. |
| feature_factors = dist_per_feature_first_record * |
|     exp ((seq (1, num_features) - 1) / max (num_features - 1, 1) * log (feature_bias_factor)); |
| |
| # Scale column j of the noise by feature_factors [j]. |
| X_shift = X_shift %*% diag (feature_factors); |
| |
| print ("Generating records by shifting from centroids..."); |
| |
| # Indicator matrix of the assignments: row i has a 1 in column Y [i]. |
| # table () only produces columns up to the largest id that actually occurs, |
| # so the result is copied into a zero matrix with all num_centroids columns. |
| seen_indicator = table (seq (1, num_records), Y); |
| full_indicator = matrix (0, rows = num_records, cols = num_centroids); |
| full_indicator [, 1 : ncol (seen_indicator)] = seen_indicator; |
| |
| # Pick each record's centroid row from C, then add its within-cluster noise. |
| record_centroids = full_indicator %*% C; |
| X = record_centroids + X_shift; |
| |
| print ("Computing record-to-cluster assignments by minimum centroid distance..."); |
| |
| # D [i, j] = -2 * <x_i, c_j> + ||c_j||^2, i.e. the squared Euclidean distance from |
| # record i to centroid j without the ||x_i||^2 term, which is constant within a row. |
| centroid_sq_norms = rowSums (C ^ 2); |
| neg_twice_dots_t = t(-2 * (X %*% t(C))); |
| D = t(neg_twice_dots_t + centroid_sq_norms); |
| |
| # Flag every centroid that attains the row minimum. |
| row_best = rowMins (D); |
| is_closest = (D <= row_best); |
| |
| # Running count of flags along each row: the number of leading zeros is the |
| # number of columns before the first closest centroid, so adding 1 gives its id. |
| closest_so_far = t(cumsum (t(is_closest))); |
| Y_by_C = rowSums (closest_so_far == 0) + 1; |
| |
| print ("Computing useful statistics..."); |
| |
| # Sum of all squared entries of X; both WCSS figures below start from it. |
| total_sq_X = sum (X ^ 2); |
| |
| # WCSS of one cluster centered at the column means of X: |
| # sum_i ||x_i||^2 - ||sum_i x_i||^2 / num_records. |
| wcss_single = total_sq_X - sum (colSums (X) ^ 2) / num_records; |
| print ("Default (single-cluster) WCSS = " + wcss_single); |
| |
| # WCSS when every record goes to its closest generating centroid: rowMins (D) |
| # is that squared distance minus ||x_i||^2, so adding total_sq_X restores it. |
| wcss_mixture = total_sq_X + sum (rowMins (D)); |
| print (num_centroids + "-cluster WCSS attained by the mixture centroids = " + wcss_mixture); |
| |
| print ("Writing out the resulting dataset..."); |
| |
| # Generated records and the Gaussian means they were drawn around. |
| write (X, fileX, format = fmt); |
| write (C, fileC, format = fmt); |
| |
| # Sampled cluster ids and the ids obtained by nearest-centroid distance. |
| write (Y, fileY, format = fmt); |
| write (Y_by_C, fileYbyC, format = fmt); |
| |
| # Point the user at the two label files that are meant to be compared. |
| scoring_hint = "Please run the scoring script to compare " + fileY + " with " + fileYbyC; |
| print (scoring_hint); |
| |
| print ("DONE: K-MEANS GENERATOR SCRIPT"); |
| |