# blob: 6a4c07f7341255541a11e3f2aa47513f21c670df
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
# THIS SCRIPT GENERATES SYNTHETIC DATA FOR STRATSTATS (STRATIFIED STATISTICS) TESTING
#
# INPUT PARAMETERS:
# --------------------------------------------------------------------------------------------
# NAME TYPE DEFAULT MEANING
# --------------------------------------------------------------------------------------------
# nr Int 100000 Number of records in the generated dataset
# nf Int 10 Number of features in the X and the Y parts of the generated dataset
# smin Int 10000 Minimum stratum value, a positive integer
# smax Int 20000 Maximum stratum value, a positive integer
# prs Double 100.0 How many times more likely the maximum stratum value is than the minimum stratum value
# pxnan Double 0.05 Probability of a NaN replacing a value in X
# pynan Double 0.05 Probability of a NaN replacing a value in Y
# psnan Double 0.05 Probability of a NaN replacing a value in the stratum column
# --------------------------------------------------------------------------------------------
# mxmin Double 10.0 Baseline (mean) value for the first feature in X
# mxmax Double 19.0 Baseline (mean) value for the last feature in X
# mymin Double 30.0 Baseline (mean) value for the first feature in Y (before adding X)
# mymax Double 39.0 Baseline (mean) value for the last feature in Y (before adding X)
# bmin Double 3.0 "Beta" multiplied by X before adding to Y, for the first feature
# bmax Double 3.0 "Beta" multiplied by X before adding to Y, for the last feature
# --------------------------------------------------------------------------------------------
# sxbmin Double 3.0 Standard deviation for the first feature in X, stratum dependent
# sxbmax Double 3.0 Standard deviation for the last feature in X, stratum dependent
# sxwmin Double 4.0 Standard deviation for the first feature in X, residual
# sxwmax Double 4.0 Standard deviation for the last feature in X, residual
# sybmin Double sqrt(28) Standard deviation for the first feature in Y, stratum dependent
# sybmax Double sqrt(28) Standard deviation for the last feature in Y, stratum dependent
# sywmin Double 6.0 Standard deviation for the first feature in Y, residual
# sywmax Double 6.0 Standard deviation for the last feature in Y, residual
# --------------------------------------------------------------------------------------------
# D String "Data" Location (on HDFS) to store the generated dataset
# Xcid String "Xcid" Location (on HDFS) to store the column indices of X features
# Ycid String "Ycid" Location (on HDFS) to store the column indices of Y features
# A String "Aux" Location (on HDFS) to store the auxiliary parameter values, if any
# fmt String "text" Matrix output format, usually "text", "mm", or "csv"
# --------------------------------------------------------------------------------------------
# OUTPUT: Matrix with the generated dataset, Xcid and Ycid, and possibly other auxiliaries
# Read the command-line parameters; every ifdef() falls back to the default that is
# documented in the INPUT PARAMETERS table at the top of this script.

# Dataset size and the range of stratum IDs
num_records = ifdef ($nr, 100000);
num_features = ifdef ($nf, 10);
min_stratumID = ifdef ($smin, 10000);
max_stratumID = ifdef ($smax, 20000);
prob_ratio_min_to_max_stratumID = ifdef ($prs, 100);

# Probabilities of replacing a cell by NaN in X, in Y, and in the stratum column
prob_NaN_in_X = ifdef ($pxnan, 0.05);
prob_NaN_in_Y = ifdef ($pynan, 0.05);
prob_NaN_in_stratum = ifdef ($psnan, 0.05);

# Baseline means for the first/last feature; the defaults are the documented ones
# (X runs from 10 to 19, Y runs from 30 to 39)
mean_X_min = ifdef ($mxmin, 10.0);
mean_X_max = ifdef ($mxmax, 19.0);
mean_Y_min = ifdef ($mymin, 30.0);
mean_Y_max = ifdef ($mymax, 39.0);

# "Beta" multiplier applied to X before it is added to Y, for the first/last feature
beta_min = ifdef ($bmin, 3.0);
beta_max = ifdef ($bmax, 3.0);

# Standard deviations: "between" is the stratum-dependent part, "within" is the residual
stdev_X_between_strata_min = ifdef ($sxbmin, 3.0);
stdev_X_between_strata_max = ifdef ($sxbmax, 3.0);
stdev_X_within_strata_min = ifdef ($sxwmin, 4.0);
stdev_X_within_strata_max = ifdef ($sxwmax, 4.0);
stdev_Y_between_strata_min = ifdef ($sybmin, sqrt(28.0));
stdev_Y_between_strata_max = ifdef ($sybmax, sqrt(28.0));
stdev_Y_within_strata_min = ifdef ($sywmin, 6.0);
stdev_Y_within_strata_max = ifdef ($sywmax, 6.0);

# Output locations and the matrix output format
fileData = ifdef ($D, "Data");
fileXcid = ifdef ($Xcid, "Xcid");
fileYcid = ifdef ($Ycid, "Ycid");
fileAux = ifdef ($A, "Aux" );
fmt = ifdef ($fmt, "text");
# Generate the strata, from 1 to (max_stratumID - min_stratumID + 1), as multinomial
# in which 1 is less likely than (max_stratumID - min_stratumID + 1) by a factor of
# prob_ratio_min_to_max_stratumID
#
# General case: R_S is drawn uniformly between 1 and r_bound = ratio ^ (1 + 1/span), and
# round (0.5 + r_power * log (R_S)) maps it onto 1 .. span + 1 with geometrically changing
# bucket widths.  Two degenerate inputs make that formula evaluate 0 * Inf = NaN, so they
# are handled explicitly.
stratum_span = max_stratumID - min_stratumID;
if (stratum_span == 0) {
    # A single admissible stratum value: every record belongs to stratum 1
    SID = matrix (1, rows = num_records, cols = 1);
} else if (prob_ratio_min_to_max_stratumID == 1) {
    # Equal likelihood: log (ratio) = 0 would make r_power infinite.  The limit of the
    # general formula as ratio -> 1 is a uniform draw over 1 .. stratum_span + 1.
    R_S = Rand (rows = num_records, cols = 1, min = 0.0, max = 1.0, pdf = "uniform");
    SID = round (0.5 + R_S * (stratum_span + 1));
} else {
    r_power = stratum_span / log (prob_ratio_min_to_max_stratumID);
    r_bound = prob_ratio_min_to_max_stratumID ^ (1.0 + 1.0 / stratum_span);
    if (r_bound < 1.0) {
        # ratio < 1: rescale the uniform draw onto [r_bound, 1]
        R_S = Rand (rows = num_records, cols = 1, min = 0.0, max = 1.0, pdf = "uniform");
        R_S = r_bound + R_S * (1.0-r_bound);
    } else {
        # ratio > 1: rescale the uniform draw onto [1, r_bound]
        R_S = Rand (rows = num_records, cols = 1, min = 0.0, max = 1.0, pdf = "uniform");
        R_S = 1.0 + R_S * (r_bound-1);
    }
    SID = round (0.5 + log (R_S) * r_power);
}
# Largest stratum index that was actually drawn
num_strata = max (SID);
# Indicator matrix (num_strata x num_records): cell (s, i) is 1 when record i is in stratum s
Smap = table (SID, seq (1, num_records, 1));
# Compute baseline values and standard deviations of X, Y, and beta, at each feature
# Every quantity is interpolated linearly from its *_min value (first feature) to its
# *_max value (last feature), giving one entry per feature.
feature_index = seq (0, num_features - 1, 1);
# With a single feature there is nothing to interpolate; using 1 as the divisor keeps the
# slope finite, so the sole feature gets the *_min value instead of NaN (0 * Inf or 0/0)
num_feature_steps = max (num_features - 1, 1);
mean_X = mean_X_min + ((mean_X_max - mean_X_min) / num_feature_steps) * feature_index;
mean_Y = mean_Y_min + ((mean_Y_max - mean_Y_min) / num_feature_steps) * feature_index;
betas = beta_min + (( beta_max - beta_min) / num_feature_steps) * feature_index;
stdev_X_within_strata = stdev_X_within_strata_min +
    ((stdev_X_within_strata_max - stdev_X_within_strata_min ) / num_feature_steps) * feature_index;
stdev_X_between_strata = stdev_X_between_strata_min +
    ((stdev_X_between_strata_max - stdev_X_between_strata_min) / num_feature_steps) * feature_index;
stdev_Y_within_strata = stdev_Y_within_strata_min +
    ((stdev_Y_within_strata_max - stdev_Y_within_strata_min ) / num_feature_steps) * feature_index;
stdev_Y_between_strata = stdev_Y_between_strata_min +
    ((stdev_Y_between_strata_max - stdev_Y_between_strata_min) / num_feature_steps) * feature_index;
# Generate X and Y matrices
# The random matrices are kept transposed (one row per feature) so that the per-feature
# vectors of means, standard deviations, and betas apply as matrix-vector operations.
RX_strata = Rand (rows = num_features, cols = num_strata, pdf = "normal");
RY_strata = Rand (rows = num_features, cols = num_strata, pdf = "normal");
RX_records = Rand (rows = num_features, cols = num_records, pdf = "normal");
RY_records = Rand (rows = num_features, cols = num_records, pdf = "normal");

# X = residual (per record) + stratum-dependent part, spread to records through Smap
X_residual = RX_records * stdev_X_within_strata;
X_per_stratum = RX_strata * stdev_X_between_strata + mean_X;
t_X = X_residual + X_per_stratum %*% Smap;

# Y is built the same way, and then beta times X is added on top
Y_residual = RY_records * stdev_Y_within_strata;
Y_per_stratum = RY_strata * stdev_Y_between_strata + mean_Y;
t_Y = Y_residual + Y_per_stratum %*% Smap + (t_X * betas);

# Final layout: stratum ID in column 1, then the X features, then the Y features
stratum_column = min_stratumID - 1 + SID;
Data = cbind (stratum_column, t(t_X), t(t_Y));
# Set up the NaNs
# Each RNaN* matrix holds 1.0 in a randomly chosen fraction (its sparsity) of the cells and
# 0 elsewhere; those cells mark the values to be replaced by NaN.
RNaNS = Rand (rows = num_records, cols = 1, min = 1.0, max = 1.0, sparsity = prob_NaN_in_stratum);
RNaNX = Rand (rows = num_records, cols = num_features, min = 1.0, max = 1.0, sparsity = prob_NaN_in_X);
RNaNY = Rand (rows = num_records, cols = num_features, min = 1.0, max = 1.0, sparsity = prob_NaN_in_Y);
Mask = cbind (RNaNS, RNaNX, RNaNY) != 0;
# (1 - Mask) / (1 - Mask) is 0/0 = NaN on the marked cells and 1/1 = 1 on the others.
# Subtracting 1 turns the filler into NaN-or-zero, so that the cells which are kept retain
# their generated value instead of being shifted by +1.
NaN_or_zero = (1.0 - Mask) / (1.0 - Mask) - 1.0;
Data = Data + NaN_or_zero;
# Output the dataset and the auxiliaries
# Column 1 of Data is the stratum ID, so the X features start at column 2 and the
# Y features follow immediately after the last X column.
first_X_column = 2;
last_X_column = num_features + 1;
first_Y_column = last_X_column + 1;
last_Y_column = last_X_column + num_features;
Xcid = t(seq (first_X_column, last_X_column, 1));
Ycid = t(seq (first_Y_column, last_Y_column, 1));

# Auxiliary output: per-feature X means, Y means, and betas, side by side
Aux = cbind (mean_X, mean_Y, betas);

write (Data, fileData, format=fmt);
write (Xcid, fileXcid, format=fmt);
write (Ycid, fileYcid, format=fmt);
write (Aux, fileAux, format=fmt);