scripts/datagen/genRandData4Kmeans.dml - systemds - Git at Google

 #-------------------------------------------------------------
 #
 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
 #
 #-------------------------------------------------------------

 #
 # Generates random Gaussian-mixture data to test k-Means clustering algorithms
 #
 # INPUT PARAMETERS:
 # ----------------------------------------------------------------------------
 # NAME  TYPE   DEFAULT  MEANING
 # ----------------------------------------------------------------------------
 # nr    Int     ---     Number of records
 # nf    Int     ---     Number of features
 # nc    Int     ---     Number of clusters
 # dc    Double  ---     St.dev. of cluster "centroid" features from zero mean
 # dr    Double  ---     St.dev. of the 1-st feature in a record within cluster
 # fbf   Double  ---     Feature bias factor: Stdev(last) / Stdev(1-st) feature
 # cbf   Double  ---     Cluster bias factor: Prob[1-st clus] / Prob[k-th clus]
 # X     String  "X"     Location to write matrix X with generated data records
 # C     String  "C"     Location to write cluster "centroids" (Gaussian means)
 # Y     String  "Y"     Location to write assignment of records to cluster ids
 # YbyC  String  "YbyC"  Location to write rec-cluster assigns by min-dist to C
 # fmt   String  "text"  Output format passed to all four write() calls
 # ----------------------------------------------------------------------------
 #
 # NOTES:
 # * fbf and cbf must be positive, since their logarithms are taken below.
 # * With nf = 1 the single feature gets st.dev. "dr" (fbf has no effect);
 #   with nc = 1 every record is assigned to cluster 1 (cbf has no effect).
 #
 # Example:
 # hadoop jar SystemDS.jar -f genRandData4Kmeans.dml -nvargs nr=100000 nf=100
 #     nc=10 dc=10.0 dr=1.0 fbf=100.0 cbf=100.0 X=X.mtx C=C.mtx Y=Y.mtx YbyC=YbyC.mtx

 print ("BEGIN K-MEANS GENERATOR SCRIPT");

 # Required generator parameters (no defaults).
 num_records   = $nr;
 num_features  = $nf;
 num_centroids = $nc;
 dist_per_feature_centroids = $dc;
 dist_per_feature_first_record = $dr;
 feature_bias_factor = $fbf;
 cluster_bias_factor = $cbf;

 # Optional output locations and output format.
 fileX    = ifdef ($X, "X");
 fileC    = ifdef ($C, "C");
 fileY    = ifdef ($Y, "Y");
 fileYbyC = ifdef ($YbyC, "YbyC");
 fmt      = ifdef ($fmt, "text");

 print ("Generating cluster distribution (mixture) centroids...");

 # One centroid per row: standard-normal entries scaled by "dc".
 C = Rand (rows = num_centroids, cols = num_features, pdf = "normal");
 C = C * dist_per_feature_centroids;

 print ("Generating record-to-cluster assignments...");

 # Y is a multinomial in {1, ..., num_centroids} with 1 being more likely
 # than "num_centroids" by the factor of "cluster_bias_factor"

 rnd = Rand (rows = num_records, cols = 1, min = 0.0, max = 1.0, pdf = "uniform");
 if ((cluster_bias_factor == 1.0) | (num_centroids == 1)) {
     # Unbiased case. A single cluster is also handled here, because the
     # biased formula below divides by (num_centroids - 1).
     Y = round (0.5 + rnd * num_centroids);
 } else {
     rnd_scaled = rnd * (1 - cluster_bias_factor ^ (- num_centroids / (num_centroids - 1)));
     Y = round (0.5 - (num_centroids - 1) * log (1 - rnd_scaled) / log (cluster_bias_factor));
 }
 # The pre-rounding value approaches num_centroids + 0.5 as rnd approaches 1,
 # so floating-point error could yield an id one past the last cluster;
 # clamp to the valid id range.
 Y = min (max (Y, 1), num_centroids);

 print ("Generating within-cluster random shifts...");

 X_shift = Rand (rows = num_records, cols = num_features, pdf = "normal");
 # Per-feature st.dev. grows geometrically from "dr" (1-st feature) to
 # dr * fbf (last feature). The divisor is kept at >= 1 so that a single
 # feature yields factor "dr" instead of 0/0 = NaN.
 feature_span = max (num_features - 1, 1);
 feature_factors = dist_per_feature_first_record *
     exp ((seq (1, num_features) - 1) / feature_span * log (feature_bias_factor));
 X_shift = X_shift %*% diag (feature_factors);

 print ("Generating records by shifting from centroids...");

 # One-hot record-by-cluster indicator matrix. The explicit output dimensions
 # keep all num_centroids columns even when trailing clusters are empty.
 Y_bitmap = table (seq (1, num_records), Y, num_records, num_centroids);
 X = Y_bitmap %*% C + X_shift;

 print ("Computing record-to-cluster assignments by minimum centroid distance...");

 # D[i,j] = ||C_j||^2 - 2 * <X_i, C_j>, i.e. the squared distance from record i
 # to centroid j minus the row-constant ||X_i||^2 (irrelevant for the row min).
 D = t(t(-2 * (X %*% t(C))) + rowSums (C ^ 2));
 # P flags the entries that attain the row minimum. The row-wise cumulative sum
 # stays 0 up to the first flagged column, so (number of zeros + 1) is the
 # index of the first minimum in each row.
 P = (D <= rowMins (D));
 aggr_P = t(cumsum (t(P)));
 Y_by_C = rowSums (aggr_P == 0) + 1;

 print ("Computing useful statistics...");

 sumXsq = sum (X ^ 2);
 # Sum of squared deviations of all records from the overall mean.
 default_wcss  = sumXsq - sum (colSums (X) ^ 2) / num_records;
 # Adding sum(X^2) restores the ||X_i||^2 terms that D leaves out.
 attained_wcss = sumXsq + sum (rowMins (D));

 print ("Default (single-cluster) WCSS = " + default_wcss);
 print (num_centroids + "-cluster WCSS attained by the mixture centroids = " + attained_wcss);

 print ("Writing out the resulting dataset...");

 write (X, fileX, format = fmt);
 write (C, fileC, format = fmt);
 write (Y, fileY, format = fmt);
 write (Y_by_C, fileYbyC, format = fmt);

 print ("Please run the scoring script to compare " + fileY + " with " + fileYbyC);

 print ("DONE: K-MEANS GENERATOR SCRIPT");
	#-------------------------------------------------------------
	#
	# Licensed to the Apache Software Foundation (ASF) under one
	# or more contributor license agreements. See the NOTICE file
	# distributed with this work for additional information
	# regarding copyright ownership. The ASF licenses this file
	# to you under the Apache License, Version 2.0 (the
	# "License"); you may not use this file except in compliance
	# with the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing,
	# software distributed under the License is distributed on an
	# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	# KIND, either express or implied. See the License for the
	# specific language governing permissions and limitations
	# under the License.
	#
	#-------------------------------------------------------------

	#
	# Generates random Gaussian-mixture data to test k-Means clustering algorithms
	#
	# INPUT PARAMETERS:
	# ----------------------------------------------------------------------------
	# NAME  TYPE   DEFAULT  MEANING
	# ----------------------------------------------------------------------------
	# nr    Int     ---     Number of records
	# nf    Int     ---     Number of features
	# nc    Int     ---     Number of clusters
	# dc    Double  ---     St.dev. of cluster "centroid" features from zero mean
	# dr    Double  ---     St.dev. of the 1-st feature in a record within cluster
	# fbf   Double  ---     Feature bias factor: Stdev(last) / Stdev(1-st) feature
	# cbf   Double  ---     Cluster bias factor: Prob[1-st clus] / Prob[k-th clus]
	# X     String  "X"     Location to write matrix X with generated data records
	# C     String  "C"     Location to write cluster "centroids" (Gaussian means)
	# Y     String  "Y"     Location to write assignment of records to cluster ids
	# YbyC  String  "YbyC"  Location to write rec-cluster assigns by min-dist to C
	# fmt   String  "text"  Output format passed to all four write() calls
	# ----------------------------------------------------------------------------
	#
	# NOTES:
	# * fbf and cbf must be positive, since their logarithms are taken below.
	# * With nf = 1 the single feature gets st.dev. "dr" (fbf has no effect);
	#   with nc = 1 every record is assigned to cluster 1 (cbf has no effect).
	#
	# Example:
	# hadoop jar SystemDS.jar -f genRandData4Kmeans.dml -nvargs nr=100000 nf=100
	#     nc=10 dc=10.0 dr=1.0 fbf=100.0 cbf=100.0 X=X.mtx C=C.mtx Y=Y.mtx YbyC=YbyC.mtx

	print ("BEGIN K-MEANS GENERATOR SCRIPT");

	# Required generator parameters (no defaults).
	num_records = $nr;
	num_features = $nf;
	num_centroids = $nc;
	dist_per_feature_centroids = $dc;
	dist_per_feature_first_record = $dr;
	feature_bias_factor = $fbf;
	cluster_bias_factor = $cbf;

	# Optional output locations and output format.
	fileX = ifdef ($X, "X");
	fileC = ifdef ($C, "C");
	fileY = ifdef ($Y, "Y");
	fileYbyC = ifdef ($YbyC, "YbyC");
	fmt = ifdef ($fmt, "text");

	print ("Generating cluster distribution (mixture) centroids...");

	# One centroid per row: standard-normal entries scaled by "dc".
	C = Rand (rows = num_centroids, cols = num_features, pdf = "normal");
	C = C * dist_per_feature_centroids;

	print ("Generating record-to-cluster assignments...");

	# Y is a multinomial in {1, ..., num_centroids} with 1 being more likely
	# than "num_centroids" by the factor of "cluster_bias_factor"

	rnd = Rand (rows = num_records, cols = 1, min = 0.0, max = 1.0, pdf = "uniform");
	if ((cluster_bias_factor == 1.0) | (num_centroids == 1)) {
		# Unbiased case. A single cluster is also handled here, because the
		# biased formula below divides by (num_centroids - 1).
		Y = round (0.5 + rnd * num_centroids);
	} else {
		rnd_scaled = rnd * (1 - cluster_bias_factor ^ (- num_centroids / (num_centroids - 1)));
		Y = round (0.5 - (num_centroids - 1) * log (1 - rnd_scaled) / log (cluster_bias_factor));
	}
	# The pre-rounding value approaches num_centroids + 0.5 as rnd approaches 1,
	# so floating-point error could yield an id one past the last cluster;
	# clamp to the valid id range.
	Y = min (max (Y, 1), num_centroids);

	print ("Generating within-cluster random shifts...");

	X_shift = Rand (rows = num_records, cols = num_features, pdf = "normal");
	# Per-feature st.dev. grows geometrically from "dr" (1-st feature) to
	# dr * fbf (last feature). The divisor is kept at >= 1 so that a single
	# feature yields factor "dr" instead of 0/0 = NaN.
	feature_span = max (num_features - 1, 1);
	feature_factors = dist_per_feature_first_record *
		exp ((seq (1, num_features) - 1) / feature_span * log (feature_bias_factor));
	X_shift = X_shift %*% diag (feature_factors);

	print ("Generating records by shifting from centroids...");

	# One-hot record-by-cluster indicator matrix. The explicit output dimensions
	# keep all num_centroids columns even when trailing clusters are empty.
	Y_bitmap = table (seq (1, num_records), Y, num_records, num_centroids);
	X = Y_bitmap %*% C + X_shift;

	print ("Computing record-to-cluster assignments by minimum centroid distance...");

	# D[i,j] = ||C_j||^2 - 2 * <X_i, C_j>, i.e. the squared distance from record i
	# to centroid j minus the row-constant ||X_i||^2 (irrelevant for the row min).
	D = t(t(-2 * (X %*% t(C))) + rowSums (C ^ 2));
	# P flags the entries that attain the row minimum. The row-wise cumulative sum
	# stays 0 up to the first flagged column, so (number of zeros + 1) is the
	# index of the first minimum in each row.
	P = (D <= rowMins (D));
	aggr_P = t(cumsum (t(P)));
	Y_by_C = rowSums (aggr_P == 0) + 1;

	print ("Computing useful statistics...");

	sumXsq = sum (X ^ 2);
	# Sum of squared deviations of all records from the overall mean.
	default_wcss = sumXsq - sum (colSums (X) ^ 2) / num_records;
	# Adding sum(X^2) restores the ||X_i||^2 terms that D leaves out.
	attained_wcss = sumXsq + sum (rowMins (D));

	print ("Default (single-cluster) WCSS = " + default_wcss);
	print (num_centroids + "-cluster WCSS attained by the mixture centroids = " + attained_wcss);

	print ("Writing out the resulting dataset...");

	write (X, fileX, format = fmt);
	write (C, fileC, format = fmt);
	write (Y, fileY, format = fmt);
	write (Y_by_C, fileYbyC, format = fmt);

	print ("Please run the scoring script to compare " + fileY + " with " + fileYbyC);

	print ("DONE: K-MEANS GENERATOR SCRIPT");