| #------------------------------------------------------------- |
| # |
| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| # |
| #------------------------------------------------------------- |
| |
| # THIS SCRIPT GENERATES SYNTHETIC DATA FOR STRATSTATS (STRATIFIED STATISTICS) TESTING |
| # |
| # INPUT PARAMETERS: |
| # -------------------------------------------------------------------------------------------- |
| # NAME TYPE DEFAULT MEANING |
| # -------------------------------------------------------------------------------------------- |
| # nr Int 100000 Number of records in the generated dataset |
| # nf Int 10 Number of features in the X and the Y parts of the generated dataset |
| # smin Int 10000 Minimum stratum value, a positive integer |
| # smax Int 20000 Maximum stratum value, a positive integer |
| # prs Double 100.0 Ratio P(maximum stratum value) / P(minimum stratum value); values above 1 favor the maximum |
| # pxnan Double 0.05 Probability of a NaN replacing a value in X |
| # pynan Double 0.05 Probability of a NaN replacing a value in Y |
| # psnan Double 0.05 Probability of a NaN replacing a value in the stratum column |
| # -------------------------------------------------------------------------------------------- |
| # mxmin Double 31.0 Baseline (mean) value for the first feature in X |
| # mxmax Double 40.0 Baseline (mean) value for the last feature in X |
| # mymin Double 11.0 Baseline (mean) value for the first feature in Y (before adding X) |
| # mymax Double 20.0 Baseline (mean) value for the last feature in Y (before adding X) |
| # bmin Double 3.0 "Beta" multiplied by X before adding to Y, for the first feature |
| # bmax Double 3.0 "Beta" multiplied by X before adding to Y, for the last feature |
| # -------------------------------------------------------------------------------------------- |
| # sxbmin Double 3.0 Standard deviation for the first feature in X, stratum dependent |
| # sxbmax Double 3.0 Standard deviation for the last feature in X, stratum dependent |
| # sxwmin Double 4.0 Standard deviation for the first feature in X, residual |
| # sxwmax Double 4.0 Standard deviation for the last feature in X, residual |
| # sybmin Double sqrt(28) Standard deviation for the first feature in Y, stratum dependent |
| # sybmax Double sqrt(28) Standard deviation for the last feature in Y, stratum dependent |
| # sywmin Double 6.0 Standard deviation for the first feature in Y, residual |
| # sywmax Double 6.0 Standard deviation for the last feature in Y, residual |
| # -------------------------------------------------------------------------------------------- |
| # D String "Data" Location (on HDFS) to store the generated dataset |
| # Xcid String "Xcid" Location (on HDFS) to store the column indices of X features |
| # Ycid String "Ycid" Location (on HDFS) to store the column indices of Y features |
| # A String "Aux" Location (on HDFS) to store the auxiliary parameter values, if any |
| # fmt String "text" Matrix output format, usually "text", "mm", or "csv" |
| # -------------------------------------------------------------------------------------------- |
| # OUTPUT: Matrix with the generated dataset, Xcid and Ycid, and possibly other auxiliaries |
| |
| # Read every command-line parameter, substituting the given default when it is not supplied. |
| |
| # --- Dataset size, stratum ID range, and NaN injection rates --- |
| num_records = ifdef ($nr, 100000); # rows in the generated dataset |
| num_features = ifdef ($nf, 10); # feature count, used for both X and Y |
| min_stratumID = ifdef ($smin, 10000); # smallest stratum ID |
| max_stratumID = ifdef ($smax, 20000); # largest stratum ID |
| prob_ratio_min_to_max_stratumID = ifdef ($prs, 100); # likelihood ratio between the two extreme strata |
| prob_NaN_in_X = ifdef ($pxnan, 0.05); # NaN probability per X cell |
| prob_NaN_in_Y = ifdef ($pynan, 0.05); # NaN probability per Y cell |
| prob_NaN_in_stratum = ifdef ($psnan, 0.05); # NaN probability per stratum cell |
| |
| # --- Baseline means and betas; each *_min applies to the first feature, each *_max to the last --- |
| mean_X_min = ifdef ($mxmin, 31.0); # X baseline, first feature |
| mean_X_max = ifdef ($mxmax, 40.0); # X baseline, last feature |
| mean_Y_min = ifdef ($mymin, 11.0); # Y baseline, first feature |
| mean_Y_max = ifdef ($mymax, 20.0); # Y baseline, last feature |
| beta_min = ifdef ($bmin, 3.0); # multiplier of X added into Y, first feature |
| beta_max = ifdef ($bmax, 3.0); # multiplier of X added into Y, last feature |
| |
| # --- Standard deviations: "between" is the per-stratum part, "within" is the per-record part --- |
| stdev_X_between_strata_min = ifdef ($sxbmin, 3.0); # X, between strata, first feature |
| stdev_X_between_strata_max = ifdef ($sxbmax, 3.0); # X, between strata, last feature |
| stdev_X_within_strata_min = ifdef ($sxwmin, 4.0); # X, within strata, first feature |
| stdev_X_within_strata_max = ifdef ($sxwmax, 4.0); # X, within strata, last feature |
| stdev_Y_between_strata_min = ifdef ($sybmin, sqrt(28.0)); # Y, between strata, first feature |
| stdev_Y_between_strata_max = ifdef ($sybmax, sqrt(28.0)); # Y, between strata, last feature |
| stdev_Y_within_strata_min = ifdef ($sywmin, 6.0); # Y, within strata, first feature |
| stdev_Y_within_strata_max = ifdef ($sywmax, 6.0); # Y, within strata, last feature |
| |
| # --- Output locations and matrix format --- |
| fileData = ifdef ($D, "Data"); # generated dataset |
| fileXcid = ifdef ($Xcid, "Xcid"); # column indices of the X features |
| fileYcid = ifdef ($Ycid, "Ycid"); # column indices of the Y features |
| fileAux = ifdef ($A, "Aux" ); # auxiliary parameter values |
| fmt = ifdef ($fmt, "text"); # format passed to every write() |
| |
| # Draw a stratum index between 1 and (max_stratumID - min_stratumID + 1) for every record. |
| # Each index is round (0.5 + log (R) * r_scale), where R is uniform on the interval between |
| # 1.0 and r_limit. This makes index 1 less likely than the last index by a factor of |
| # prob_ratio_min_to_max_stratumID. |
| |
| stratum_span = max_stratumID - min_stratumID; |
| r_scale = stratum_span / log (prob_ratio_min_to_max_stratumID); |
| r_limit = prob_ratio_min_to_max_stratumID ^ (1.0 + 1.0 / stratum_span); |
| |
| # r_limit may lie on either side of 1.0; order the two ends of the sampling interval |
| if (r_limit < 1.0) { |
|     R_low = r_limit; |
|     R_high = 1.0; |
| } else { |
|     R_low = 1.0; |
|     R_high = r_limit; |
| } |
| |
| # Stretch a unit-interval uniform sample onto [R_low, R_high] |
| U_S = Rand (rows = num_records, cols = 1, min = 0.0, max = 1.0, pdf = "uniform"); |
| R_S = R_low + U_S * (R_high - R_low); |
| |
| SID = round (0.5 + log (R_S) * r_scale); |
| num_strata = max (SID); |
| |
| # Smap has one row per stratum index and one column per record (indicator of membership) |
| Smap = table (SID, seq (1, num_records, 1)); |
| |
| # Compute baseline values and standard deviations of X, Y, and beta, at each feature. |
| # Every quantity is a column vector with one entry per feature, interpolated linearly from |
| # its *_min value (first feature) to its *_max value (last feature). |
| |
| # Offsets 0, 1, ..., num_features - 1 of each feature from the first one |
| feature_offset = seq (0, num_features - 1, 1); |
| |
| # Number of interpolation steps between the first and the last feature. With a single |
| # feature there are no steps; the divisor is kept at 1 so that the step size is 0 rather |
| # than 0/0 = NaN, and the single feature simply receives the *_min value. |
| num_steps = max (num_features - 1, 1); |
| |
| mean_X = mean_X_min + ((mean_X_max - mean_X_min) / num_steps) * feature_offset; |
| mean_Y = mean_Y_min + ((mean_Y_max - mean_Y_min) / num_steps) * feature_offset; |
| betas = beta_min + (( beta_max - beta_min) / num_steps) * feature_offset; |
| |
| stdev_X_within_strata = stdev_X_within_strata_min + |
| ((stdev_X_within_strata_max - stdev_X_within_strata_min ) / num_steps) * feature_offset; |
| stdev_X_between_strata = stdev_X_between_strata_min + |
| ((stdev_X_between_strata_max - stdev_X_between_strata_min) / num_steps) * feature_offset; |
| stdev_Y_within_strata = stdev_Y_within_strata_min + |
| ((stdev_Y_within_strata_max - stdev_Y_within_strata_min ) / num_steps) * feature_offset; |
| stdev_Y_between_strata = stdev_Y_between_strata_min + |
| ((stdev_Y_between_strata_max - stdev_Y_between_strata_min) / num_steps) * feature_offset; |
| |
| # Generate X and Y matrices. |
| # All intermediate matrices are kept transposed (one row per feature) so that the |
| # per-feature column vectors can be applied with matrix-vector operations. |
| |
| RX_strata = Rand (rows = num_features, cols = num_strata, pdf = "normal"); |
| RY_strata = Rand (rows = num_features, cols = num_strata, pdf = "normal"); |
| RX_records = Rand (rows = num_features, cols = num_records, pdf = "normal"); |
| RY_records = Rand (rows = num_features, cols = num_records, pdf = "normal"); |
| |
| # X = per-record noise + per-stratum value (baseline mean plus stratum noise) mapped to records |
| X_record_part = RX_records * stdev_X_within_strata; |
| X_stratum_part = (RX_strata * stdev_X_between_strata + mean_X) %*% Smap; |
| t_X = X_record_part + X_stratum_part; |
| |
| # Y is built the same way, then X scaled by the per-feature betas is added on top |
| Y_record_part = RY_records * stdev_Y_within_strata; |
| Y_stratum_part = (RY_strata * stdev_Y_between_strata + mean_Y) %*% Smap; |
| t_Y = Y_record_part + Y_stratum_part + (t_X * betas); |
| |
| # Column 1 holds the stratum ID (index shifted to start at min_stratumID), then X, then Y |
| stratum_column = min_stratumID - 1 + SID; |
| Data = cbind (stratum_column, t(t_X), t(t_Y)); |
| |
| # Set up the NaNs. |
| # Each RNaN* matrix holds 1.0 in a random subset of cells (expected fraction = the given |
| # probability) and 0 elsewhere; Mask flags the cells of Data to be replaced by NaN. |
| |
| RNaNS = Rand (rows = num_records, cols = 1, min = 1.0, max = 1.0, sparsity = prob_NaN_in_stratum); |
| RNaNX = Rand (rows = num_records, cols = num_features, min = 1.0, max = 1.0, sparsity = prob_NaN_in_X); |
| RNaNY = Rand (rows = num_records, cols = num_features, min = 1.0, max = 1.0, sparsity = prob_NaN_in_Y); |
| Mask = cbind (RNaNS, RNaNX, RNaNY) != 0; |
| |
| # (1 - Mask) / (1 - Mask) is 0/0 = NaN on masked cells and 1/1 = 1 on all other cells. |
| # Subtracting 1 turns it into an exact NaN-or-zero term, so adding it to Data overwrites |
| # masked cells with NaN and leaves every other cell unchanged (instead of shifted by +1). |
| NaN_or_zero = (1.0 - Mask) / (1.0 - Mask) - 1.0; |
| Data = Data + NaN_or_zero; |
| |
| # Output the dataset and the auxiliaries. |
| # Data layout: column 1 = stratum ID, the next num_features columns = X, the rest = Y. |
| |
| first_X_col = 2; |
| last_X_col = num_features + 1; |
| first_Y_col = num_features + 2; |
| last_Y_col = 2 * num_features + 1; |
| |
| # Row vectors listing the column positions of the X and the Y features inside Data |
| Xcid = t(seq (first_X_col, last_X_col, 1)); |
| Ycid = t(seq (first_Y_col, last_Y_col, 1)); |
| |
| # One row per feature: baseline mean of X, baseline mean of Y, and beta |
| Aux = cbind (mean_X, mean_Y, betas); |
| |
| write (Data, fileData, format=fmt); |
| write (Xcid, fileXcid, format=fmt); |
| write (Ycid, fileYcid, format=fmt); |
| write (Aux, fileAux, format=fmt); |
| |