# scripts/datagen/genRandData4StratStats.dml - systemds - Git at Google

 #-------------------------------------------------------------
 #
 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
 #
 #-------------------------------------------------------------

 # THIS SCRIPT GENERATES SYNTHETIC DATA FOR STRATSTATS (STRATIFIED STATISTICS) TESTING
 #
 # INPUT PARAMETERS:
 # --------------------------------------------------------------------------------------------
 # NAME   TYPE   DEFAULT  MEANING
 # --------------------------------------------------------------------------------------------
 # nr     Int    100000   Number of records in the generated dataset
 # nf     Int      10     Number of features in the X and the Y parts of the generated dataset
 # smin   Int     10000   Minimum stratum value, a positive integer
 # smax   Int     20000   Maximum stratum value, a positive integer
 # prs    Double  100.0   How many times more likely the maximum stratum value is vs. the minimum
 # pxnan  Double    0.05  Probability of a NaN replacing a value in X
 # pynan  Double    0.05  Probability of a NaN replacing a value in Y
 # psnan  Double    0.05  Probability of a NaN replacing a value in the stratum column
 # --------------------------------------------------------------------------------------------
 # mxmin  Double   10.0   Baseline (mean) value for the first feature in X
 # mxmax  Double   19.0   Baseline (mean) value for the last feature in X
 # mymin  Double   30.0   Baseline (mean) value for the first feature in Y (before adding X)
 # mymax  Double   39.0   Baseline (mean) value for the last feature in Y (before adding X)
 # bmin   Double    3.0   "Beta" multiplied by X before adding to Y, for the first feature
 # bmax   Double    3.0   "Beta" multiplied by X before adding to Y, for the last feature
 # --------------------------------------------------------------------------------------------
 # sxbmin Double    3.0   Standard deviation for the first feature in X, stratum dependent
 # sxbmax Double    3.0   Standard deviation for the last feature in X, stratum dependent
 # sxwmin Double    4.0   Standard deviation for the first feature in X, residual
 # sxwmax Double    4.0   Standard deviation for the last feature in X, residual
 # sybmin Double sqrt(28) Standard deviation for the first feature in Y, stratum dependent
 # sybmax Double sqrt(28) Standard deviation for the last feature in Y, stratum dependent
 # sywmin Double    6.0   Standard deviation for the first feature in Y, residual
 # sywmax Double    6.0   Standard deviation for the last feature in Y, residual
 # --------------------------------------------------------------------------------------------
 # D      String  "Data"  Location (on HDFS) to store the generated dataset
 # Xcid   String  "Xcid"  Location (on HDFS) to store the column indices of X features
 # Ycid   String  "Ycid"  Location (on HDFS) to store the column indices of Y features
 # A      String  "Aux"   Location (on HDFS) to store the auxiliary parameter values, if any
 # fmt    String  "text"  Matrix output format, usually "text", "mm", or "csv"
 # --------------------------------------------------------------------------------------------
 # OUTPUT: Matrix with the generated dataset, Xcid and Ycid, and possibly other auxiliaries

 # Read the command-line parameters; ifdef supplies the default for any omitted one.

 # Dataset size, stratum ID range, stratum skew, and NaN probabilities
 num_records   = ifdef ($nr, 100000);
 num_features  = ifdef ($nf, 10);
 min_stratumID = ifdef ($smin, 10000);
 max_stratumID = ifdef ($smax, 20000);
 prob_ratio_min_to_max_stratumID = ifdef ($prs, 100);
 prob_NaN_in_X = ifdef ($pxnan, 0.05);
 prob_NaN_in_Y = ifdef ($pynan, 0.05);
 prob_NaN_in_stratum = ifdef ($psnan, 0.05);

 # Baseline means and betas for the first ("min") and the last ("max") feature;
 # the defaults follow the INPUT PARAMETERS table in the header of this script
 mean_X_min = ifdef ($mxmin, 10.0);
 mean_X_max = ifdef ($mxmax, 19.0);
 mean_Y_min = ifdef ($mymin, 30.0);
 mean_Y_max = ifdef ($mymax, 39.0);
 beta_min   = ifdef ($bmin,   3.0);
 beta_max   = ifdef ($bmax,   3.0);

 # Standard deviations: "between" = stratum dependent, "within" = residual
 stdev_X_between_strata_min = ifdef ($sxbmin, 3.0);
 stdev_X_between_strata_max = ifdef ($sxbmax, 3.0);
 stdev_X_within_strata_min  = ifdef ($sxwmin, 4.0);
 stdev_X_within_strata_max  = ifdef ($sxwmax, 4.0);
 stdev_Y_between_strata_min = ifdef ($sybmin, sqrt(28.0));
 stdev_Y_between_strata_max = ifdef ($sybmax, sqrt(28.0));
 stdev_Y_within_strata_min  = ifdef ($sywmin, 6.0);
 stdev_Y_within_strata_max  = ifdef ($sywmax, 6.0);

 # Output locations and matrix output format
 fileData = ifdef ($D,    "Data");
 fileXcid = ifdef ($Xcid, "Xcid");
 fileYcid = ifdef ($Ycid, "Ycid");
 fileAux  = ifdef ($A,    "Aux" );
 fmt      = ifdef ($fmt,  "text");

 # Generate the strata, from 1 to (max_stratumID - min_stratumID + 1), as multinomial
 # in which 1 is less likely than (max_stratumID - min_stratumID + 1) by a factor of
 # prob_ratio_min_to_max_stratumID.
 # R_S is drawn uniformly between 1 and r_bound, so that log (R_S) * r_power falls between
 # 0 and (max_stratumID - min_stratumID + 1); rounding then yields the stratum number.

 num_stratum_values = max_stratumID - min_stratumID + 1;
 R_S = Rand (rows = num_records, cols = 1, min = 0.0, max = 1.0, pdf = "uniform");

 if (num_stratum_values == 1 | prob_ratio_min_to_max_stratumID == 1.0) {
     # Degenerate cases: a single stratum value makes (max_stratumID - min_stratumID) zero,
     # and a ratio of 1 makes log (ratio) zero, so the formulas below would divide by zero
     # and produce NaN strata.  All stratum values are equally likely here.
     SID = round (0.5 + R_S * num_stratum_values);
 } else {
     r_power = (max_stratumID - min_stratumID) / log (prob_ratio_min_to_max_stratumID);
     r_bound = prob_ratio_min_to_max_stratumID ^ (1.0 + 1.0 / (max_stratumID - min_stratumID));

     # Stretch the uniform draw onto the interval between 1 and r_bound
     if (r_bound < 1.0) {
         R_S = r_bound + R_S * (1.0-r_bound);
     } else {
         R_S = 1.0 + R_S * (r_bound-1);
     }
     SID = round (0.5 + log (R_S) * r_power);
 }

 num_strata = max (SID);
 # Smap has one row per stratum and one column per record, with a 1 marking each
 # record's stratum
 Smap = table (SID, seq (1, num_records, 1));

 # Compute baseline values and standard deviations of X, Y, and beta, at each feature.
 # Every quantity is interpolated linearly from its "min" value at the first feature
 # to its "max" value at the last feature, giving one column vector entry per feature.

 # Number of interpolation steps, clamped to 1 so that a single-feature dataset
 # (num_features = 1) takes the "min" values instead of dividing by zero into NaN
 num_steps   = max (num_features - 1, 1);
 feature_pos = seq (0, num_features - 1, 1);

 mean_X = mean_X_min + ((mean_X_max - mean_X_min) / num_steps) * feature_pos;
 mean_Y = mean_Y_min + ((mean_Y_max - mean_Y_min) / num_steps) * feature_pos;
 betas  =   beta_min + ((  beta_max -   beta_min) / num_steps) * feature_pos;

 stdev_X_within_strata  = stdev_X_within_strata_min  +
     ((stdev_X_within_strata_max  - stdev_X_within_strata_min ) / num_steps) * feature_pos;
 stdev_X_between_strata = stdev_X_between_strata_min +
     ((stdev_X_between_strata_max - stdev_X_between_strata_min) / num_steps) * feature_pos;
 stdev_Y_within_strata  = stdev_Y_within_strata_min  +
     ((stdev_Y_within_strata_max  - stdev_Y_within_strata_min ) / num_steps) * feature_pos;
 stdev_Y_between_strata = stdev_Y_between_strata_min +
     ((stdev_Y_between_strata_max - stdev_Y_between_strata_min) / num_steps) * feature_pos;

 # Generate X and Y matrices.
 # All random matrices are built transposed (one row per feature) so that the
 # per-feature column vectors combine with them through matrix-vector operations.

 RX_strata  = Rand (rows = num_features, cols = num_strata,  pdf = "normal");
 RY_strata  = Rand (rows = num_features, cols = num_strata,  pdf = "normal");
 RX_records = Rand (rows = num_features, cols = num_records, pdf = "normal");
 RY_records = Rand (rows = num_features, cols = num_records, pdf = "normal");

 # Stratum-level values (baseline mean plus between-strata noise), expanded through
 # Smap from one column per stratum to one column per record
 X_stratum_part = (RX_strata * stdev_X_between_strata + mean_X) %*% Smap;
 Y_stratum_part = (RY_strata * stdev_Y_between_strata + mean_Y) %*% Smap;

 # Record-level values: within-strata noise plus the stratum part; Y also adds beta * X
 t_X = RX_records * stdev_X_within_strata + X_stratum_part;
 t_Y = RY_records * stdev_Y_within_strata + Y_stratum_part + (t_X * betas);

 # First column is the stratum ID shifted into the min_stratumID-based range
 stratum_column = min_stratumID - 1 + SID;
 Data = cbind (stratum_column, t(t_X), t(t_Y));

 # Set up the NaNs.
 # Each RNaN* matrix is generated with min = max = 1.0 and the requested sparsity, so its
 # nonzero cells (all equal to 1) mark the values to be replaced by NaN.

 RNaNS = Rand  (rows = num_records, cols = 1, min = 1.0, max = 1.0, sparsity = prob_NaN_in_stratum);
 RNaNX = Rand  (rows = num_records, cols = num_features, min = 1.0, max = 1.0, sparsity = prob_NaN_in_X);
 RNaNY = Rand  (rows = num_records, cols = num_features, min = 1.0, max = 1.0, sparsity = prob_NaN_in_Y);
 Mask = cbind (RNaNS, RNaNX, RNaNY) != 0;

 # (1 - Mask) / (1 - Mask) is 0/0 = NaN in the masked cells and 1 everywhere else.
 # Subtracting 1 turns that into NaN / 0, so the unmasked values (including the stratum
 # IDs) stay exactly as generated instead of being shifted up by 1.
 Data = Data + ((1.0 - Mask) / (1.0 - Mask) - 1.0);

 # Output the dataset and the auxiliaries.
 # Data holds the stratum in column 1, followed by the X features and then the Y features;
 # Xcid and Ycid are row vectors listing the column indices of those two groups.

 first_X_col = 2;
 last_X_col  = num_features + 1;
 Xcid = t(seq (first_X_col, last_X_col, 1));
 Ycid = t(seq (last_X_col + 1, last_X_col + num_features, 1));

 # Per-feature baseline means and betas, one column each
 Aux = cbind (mean_X, mean_Y, betas);

 write (Data, fileData, format=fmt);
 write (Xcid, fileXcid, format=fmt);
 write (Ycid, fileYcid, format=fmt);
 write (Aux,  fileAux,  format=fmt);
	#-------------------------------------------------------------
	#
	# Licensed to the Apache Software Foundation (ASF) under one
	# or more contributor license agreements. See the NOTICE file
	# distributed with this work for additional information
	# regarding copyright ownership. The ASF licenses this file
	# to you under the Apache License, Version 2.0 (the
	# "License"); you may not use this file except in compliance
	# with the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing,
	# software distributed under the License is distributed on an
	# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	# KIND, either express or implied. See the License for the
	# specific language governing permissions and limitations
	# under the License.
	#
	#-------------------------------------------------------------

	# THIS SCRIPT GENERATES SYNTHETIC DATA FOR STRATSTATS (STRATIFIED STATISTICS) TESTING
	#
	# INPUT PARAMETERS:
	# --------------------------------------------------------------------------------------------
	# NAME TYPE DEFAULT MEANING
	# --------------------------------------------------------------------------------------------
	# nr Int 100000 Number of records in the generated dataset
	# nf Int 10 Number of features in the X and the Y parts of the generated dataset
	# smin Int 10000 Minimum stratum value, a positive integer
	# smax Int 20000 Maximum stratum value, a positive integer
	# prs Double 100.0 How many times more likely the maximum stratum value is vs. the minimum
	# pxnan Double 0.05 Probability of a NaN replacing a value in X
	# pynan Double 0.05 Probability of a NaN replacing a value in Y
	# psnan Double 0.05 Probability of a NaN replacing a value in the stratum column
	# --------------------------------------------------------------------------------------------
	# mxmin Double 10.0 Baseline (mean) value for the first feature in X
	# mxmax Double 19.0 Baseline (mean) value for the last feature in X
	# mymin Double 30.0 Baseline (mean) value for the first feature in Y (before adding X)
	# mymax Double 39.0 Baseline (mean) value for the last feature in Y (before adding X)
	# bmin Double 3.0 "Beta" multiplied by X before adding to Y, for the first feature
	# bmax Double 3.0 "Beta" multiplied by X before adding to Y, for the last feature
	# --------------------------------------------------------------------------------------------
	# sxbmin Double 3.0 Standard deviation for the first feature in X, stratum dependent
	# sxbmax Double 3.0 Standard deviation for the last feature in X, stratum dependent
	# sxwmin Double 4.0 Standard deviation for the first feature in X, residual
	# sxwmax Double 4.0 Standard deviation for the last feature in X, residual
	# sybmin Double sqrt(28) Standard deviation for the first feature in Y, stratum dependent
	# sybmax Double sqrt(28) Standard deviation for the last feature in Y, stratum dependent
	# sywmin Double 6.0 Standard deviation for the first feature in Y, residual
	# sywmax Double 6.0 Standard deviation for the last feature in Y, residual
	# --------------------------------------------------------------------------------------------
	# D String "Data" Location (on HDFS) to store the generated dataset
	# Xcid String "Xcid" Location (on HDFS) to store the column indices of X features
	# Ycid String "Ycid" Location (on HDFS) to store the column indices of Y features
	# A String "Aux" Location (on HDFS) to store the auxiliary parameter values, if any
	# fmt String "text" Matrix output format, usually "text", "mm", or "csv"
	# --------------------------------------------------------------------------------------------
	# OUTPUT: Matrix with the generated dataset, Xcid and Ycid, and possibly other auxiliaries

	# Read the command-line parameters; ifdef supplies the default for any omitted one.

	# Dataset size, stratum ID range, stratum skew, and NaN probabilities
	num_records = ifdef ($nr, 100000);
	num_features = ifdef ($nf, 10);
	min_stratumID = ifdef ($smin, 10000);
	max_stratumID = ifdef ($smax, 20000);
	prob_ratio_min_to_max_stratumID = ifdef ($prs, 100);
	prob_NaN_in_X = ifdef ($pxnan, 0.05);
	prob_NaN_in_Y = ifdef ($pynan, 0.05);
	prob_NaN_in_stratum = ifdef ($psnan, 0.05);

	# Baseline means and betas for the first ("min") and the last ("max") feature;
	# the defaults follow the INPUT PARAMETERS table in the header of this script
	mean_X_min = ifdef ($mxmin, 10.0);
	mean_X_max = ifdef ($mxmax, 19.0);
	mean_Y_min = ifdef ($mymin, 30.0);
	mean_Y_max = ifdef ($mymax, 39.0);
	beta_min = ifdef ($bmin, 3.0);
	beta_max = ifdef ($bmax, 3.0);

	# Standard deviations: "between" = stratum dependent, "within" = residual
	stdev_X_between_strata_min = ifdef ($sxbmin, 3.0);
	stdev_X_between_strata_max = ifdef ($sxbmax, 3.0);
	stdev_X_within_strata_min = ifdef ($sxwmin, 4.0);
	stdev_X_within_strata_max = ifdef ($sxwmax, 4.0);
	stdev_Y_between_strata_min = ifdef ($sybmin, sqrt(28.0));
	stdev_Y_between_strata_max = ifdef ($sybmax, sqrt(28.0));
	stdev_Y_within_strata_min = ifdef ($sywmin, 6.0);
	stdev_Y_within_strata_max = ifdef ($sywmax, 6.0);

	# Output locations and matrix output format
	fileData = ifdef ($D, "Data");
	fileXcid = ifdef ($Xcid, "Xcid");
	fileYcid = ifdef ($Ycid, "Ycid");
	fileAux = ifdef ($A, "Aux" );
	fmt = ifdef ($fmt, "text");

	# Generate the strata, from 1 to (max_stratumID - min_stratumID + 1), as multinomial
	# in which 1 is less likely than (max_stratumID - min_stratumID + 1) by a factor of
	# prob_ratio_min_to_max_stratumID.
	# R_S is drawn uniformly between 1 and r_bound, so that log (R_S) * r_power falls between
	# 0 and (max_stratumID - min_stratumID + 1); rounding then yields the stratum number.

	num_stratum_values = max_stratumID - min_stratumID + 1;
	R_S = Rand (rows = num_records, cols = 1, min = 0.0, max = 1.0, pdf = "uniform");

	if (num_stratum_values == 1 | prob_ratio_min_to_max_stratumID == 1.0) {
	    # Degenerate cases: a single stratum value makes (max_stratumID - min_stratumID) zero,
	    # and a ratio of 1 makes log (ratio) zero, so the formulas below would divide by zero
	    # and produce NaN strata.  All stratum values are equally likely here.
	    SID = round (0.5 + R_S * num_stratum_values);
	} else {
	    r_power = (max_stratumID - min_stratumID) / log (prob_ratio_min_to_max_stratumID);
	    r_bound = prob_ratio_min_to_max_stratumID ^ (1.0 + 1.0 / (max_stratumID - min_stratumID));

	    # Stretch the uniform draw onto the interval between 1 and r_bound
	    if (r_bound < 1.0) {
	        R_S = r_bound + R_S * (1.0-r_bound);
	    } else {
	        R_S = 1.0 + R_S * (r_bound-1);
	    }
	    SID = round (0.5 + log (R_S) * r_power);
	}

	num_strata = max (SID);
	# Smap has one row per stratum and one column per record, with a 1 marking each
	# record's stratum
	Smap = table (SID, seq (1, num_records, 1));

	# Compute baseline values and standard deviations of X, Y, and beta, at each feature.
	# Every quantity is interpolated linearly from its "min" value at the first feature
	# to its "max" value at the last feature, giving one column vector entry per feature.

	# Number of interpolation steps, clamped to 1 so that a single-feature dataset
	# (num_features = 1) takes the "min" values instead of dividing by zero into NaN
	num_steps = max (num_features - 1, 1);
	feature_pos = seq (0, num_features - 1, 1);

	mean_X = mean_X_min + ((mean_X_max - mean_X_min) / num_steps) * feature_pos;
	mean_Y = mean_Y_min + ((mean_Y_max - mean_Y_min) / num_steps) * feature_pos;
	betas = beta_min + ((beta_max - beta_min) / num_steps) * feature_pos;

	stdev_X_within_strata = stdev_X_within_strata_min +
	    ((stdev_X_within_strata_max - stdev_X_within_strata_min) / num_steps) * feature_pos;
	stdev_X_between_strata = stdev_X_between_strata_min +
	    ((stdev_X_between_strata_max - stdev_X_between_strata_min) / num_steps) * feature_pos;
	stdev_Y_within_strata = stdev_Y_within_strata_min +
	    ((stdev_Y_within_strata_max - stdev_Y_within_strata_min) / num_steps) * feature_pos;
	stdev_Y_between_strata = stdev_Y_between_strata_min +
	    ((stdev_Y_between_strata_max - stdev_Y_between_strata_min) / num_steps) * feature_pos;

	# Generate X and Y matrices.
	# All random matrices are built transposed (one row per feature) so that the
	# per-feature column vectors combine with them through matrix-vector operations.

	RX_strata = Rand (rows = num_features, cols = num_strata, pdf = "normal");
	RY_strata = Rand (rows = num_features, cols = num_strata, pdf = "normal");
	RX_records = Rand (rows = num_features, cols = num_records, pdf = "normal");
	RY_records = Rand (rows = num_features, cols = num_records, pdf = "normal");

	# Stratum-level values (baseline mean plus between-strata noise) are expanded through
	# Smap with the matrix product %*%; Y additionally adds the element-wise product of
	# X and the betas.
	t_X = RX_records * stdev_X_within_strata + (RX_strata * stdev_X_between_strata + mean_X) %*% Smap;
	t_Y = RY_records * stdev_Y_within_strata + (RY_strata * stdev_Y_between_strata + mean_Y) %*% Smap + (t_X * betas);

	# First column is the stratum ID shifted into the min_stratumID-based range
	Data = cbind (min_stratumID - 1 + SID, t(t_X), t(t_Y));

	# Set up the NaNs.
	# Each RNaN* matrix is generated with min = max = 1.0 and the requested sparsity, so its
	# nonzero cells (all equal to 1) mark the values to be replaced by NaN.

	RNaNS = Rand (rows = num_records, cols = 1, min = 1.0, max = 1.0, sparsity = prob_NaN_in_stratum);
	RNaNX = Rand (rows = num_records, cols = num_features, min = 1.0, max = 1.0, sparsity = prob_NaN_in_X);
	RNaNY = Rand (rows = num_records, cols = num_features, min = 1.0, max = 1.0, sparsity = prob_NaN_in_Y);
	Mask = cbind (RNaNS, RNaNX, RNaNY) != 0;

	# (1 - Mask) / (1 - Mask) is 0/0 = NaN in the masked cells and 1 everywhere else.
	# Subtracting 1 turns that into NaN / 0, so the unmasked values (including the stratum
	# IDs) stay exactly as generated instead of being shifted up by 1.
	Data = Data + ((1.0 - Mask) / (1.0 - Mask) - 1.0);

	# Output the dataset and the auxiliaries.
	# Data holds the stratum in column 1, followed by the X features and then the Y features;
	# Xcid and Ycid are row vectors listing the column indices of those two groups.

	first_X_col = 2;
	last_X_col = num_features + 1;
	Xcid = t(seq (first_X_col, last_X_col, 1));
	Ycid = t(seq (last_X_col + 1, last_X_col + num_features, 1));

	# Per-feature baseline means and betas, one column each
	Aux = cbind (mean_X, mean_Y, betas);

	write (Data, fileData, format=fmt);
	write (Xcid, fileXcid, format=fmt);
	write (Ycid, fileYcid, format=fmt);
	write (Aux, fileAux, format=fmt);