#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------

#
# Generates random data to test transform with
#
# rows, cols: dimensions of the data matrix to be generated (defaults: 1000, 25)
# prob_cat: probability that a generated col is categorical (default: 0.1)
# min_domain, max_domain: range for the domain sizes of the generated categorical cols (defaults: 1, 10)
# prob_missing: probability that a generated (scale) col is selected to have missing values (default: 0.1)
# prob_missing_cell: probability of a cell in a selected col to have a missing value (default: 0.2)
# out_X, out_missing, out_categorical: output file names (all written as csv);
#   out_categorical is only written if prob_cat > 0,
#   out_missing is only written if prob_missing > 0
#

# ---- size of the generated data matrix ----
num_rows = ifdef($rows, 1000)
num_cols = ifdef($cols, 25)

# ---- categorical cols: per-col selection probability and domain-size range ----
prob_categorical = ifdef($prob_cat, 0.1)
min_domain_size = ifdef($min_domain, 1)
max_domain_size = ifdef($max_domain, 10)

# ---- missing values: per-col selection probability and per-cell probability ----
prob_missing_col = ifdef($prob_missing, 0.1)
prob_missing_val = ifdef($prob_missing_cell, 0.2)

# Decide which cols are categorical and generate their values.
#
# Defaults for the case prob_categorical <= 0: every col is scalar.
num_scalar_cols = as.double(num_cols)
num_categorical_cols = 0.0
scalar_ind = matrix(1, rows=num_scalar_cols, cols=1)

if (prob_categorical > 0) {
    # 0/1 indicator (num_cols x 1): a col is categorical if its uniform draw
    # falls below prob_categorical.
    categorical_ind = Rand(rows=num_cols, cols=1, min=0, max=1, pdf="uniform")
    categorical_ind = categorical_ind < prob_categorical

    # Col ids of the categorical cols: the non-flagged positions become 0
    # and are dropped by removeEmpty.
    categorical_col_ids = removeEmpty(target=seq(1, num_cols, 1)*categorical_ind, margin="rows")
    num_categorical_cols = sum(categorical_ind)
    write(categorical_col_ids, $out_categorical, format="csv")

    # The draw may flag no col at all. In that case skip the generation so
    # that Rand is never asked for a matrix with zero rows/cols; nothing
    # reads categorical_X unless num_categorical_cols > 0.
    if (num_categorical_cols > 0) {
        # One domain size per categorical col, rounded into
        # [min_domain_size, max_domain_size].
        domain_sizes = Rand(rows=num_categorical_cols, cols=1, min=0, max=1, pdf="uniform")
        domain_sizes = round(min_domain_size + (max_domain_size - min_domain_size)*domain_sizes)

        # Category codes in 1..domain_size for each col. The transposes line
        # up the per-col domain sizes (a column vector) with the rows of
        # t(categorical_X).
        categorical_X = Rand(rows=num_rows, cols=num_categorical_cols, min=0, max=1, pdf="uniform")
        categorical_X = t(round(1 + t(categorical_X)*(domain_sizes - 1)))
    }

    # Every col that is not categorical is scalar.
    scalar_ind = 1-categorical_ind
}

# Scalar cols: collect their col ids (positions flagged in scalar_ind),
# count them, and draw uniform [0,1] values for them.
col_positions = seq(1, num_cols, 1)
scalar_col_ids = removeEmpty(target=col_positions*scalar_ind, margin="rows")
num_scalar_cols = sum(scalar_ind)
scalar_X = Rand(rows=num_rows, cols=num_scalar_cols, min=0, max=1, pdf="uniform")

# Assemble the data matrix X from the scalar and the categorical cols.
if (num_categorical_cols > 0 & num_scalar_cols > 0) {
    # Scalar cols first, categorical cols after them; the permutation below
    # moves every col to the position recorded in its col id.
    X = cbind(scalar_X, categorical_X)

    # Row i of scalar_perm has a single 1 in col scalar_col_ids[i], i.e. it
    # sends the i-th scalar col of X to its target position.
    scalar_perm = table(seq(1, num_scalar_cols, 1), scalar_col_ids, num_scalar_cols, num_cols)

    # Same mapping for the categorical cols. These rows must not be all
    # zero: zero rows would wipe the categorical cols out of X in the
    # multiplication below.
    categorical_perm = table(seq(1, num_categorical_cols, 1), categorical_col_ids, num_categorical_cols, num_cols)

    # Stack the two parts row-wise into a num_cols x num_cols permutation.
    permut_mat = t(cbind(t(scalar_perm), t(categorical_perm)))
    X = X %*% permut_mat
} else {
    # Only one kind of col exists, so no reordering is needed.
    if (num_categorical_cols > 0) {
        X = categorical_X
    } else {
        if (num_scalar_cols > 0) {
            X = scalar_X
        } else {
            print("somehow, we've managed to compute that precisely 0 cols should be categorical and 0 cols should be scale")
        }
    }
}

# Optionally pick cols that get missing values, then write out X.
if (prob_missing_col > 0) {
    # Flag each col with probability prob_missing_col, then keep only the
    # scalar ones (currently only support missing value imputation for
    # scale cols).
    missing_col_ind = Rand(rows=num_cols, cols=1, min=0, max=1, pdf="uniform") < prob_missing_col
    missing_col_ind = missing_col_ind * scalar_ind
    missing_col_ids = removeEmpty(target=seq(1, num_cols, 1)*missing_col_ind, margin="rows")

    # One 0/1 col per row of missing_col_ids; a cell is 1 when its uniform
    # draw falls below prob_missing_val.
    num_missing_cols = nrow(missing_col_ids)
    missing_values = Rand(rows=num_rows, cols=num_missing_cols, min=0, max=1, pdf="uniform") < prob_missing_val

    # NOTE(review): the indicators are appended to X as extra cols; the
    # cells of the selected cols themselves are left untouched. Presumably
    # the consumer applies the mask - confirm this is intended.
    X = cbind(X, missing_values)

    write(missing_col_ids, $out_missing, format="csv")
}

write(X, $out_X, format="csv")