blob: 6a442992dccfa3b1f43b9bed10df13ef19e4fc11 [file] [log] [blame]
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
#
# Generates random data to test transform with
#
# rows, cols: dimensions of the data matrix to be generated
# prob_cat: probability that a generated col is categorical
# min_domain, max_domain: provide a range for domain sizes of the generated categorical cols
# prob_missing: probability that a generated (scale) col has missing values
# prob_missing_cell: probability of a cell to have a missing value
# out_X, out_missing, out_categorical: output file names
#
# --- command-line arguments (each falls back to the default shown) ---

# shape of the generated matrix
num_cols = ifdef($cols, 25)
num_rows = ifdef($rows, 1000)

# categorical columns: selection probability and range of domain sizes
prob_categorical = ifdef($prob_cat, 0.1)
max_domain_size = ifdef($max_domain, 10)
min_domain_size = ifdef($min_domain, 1)

# missing values: per-column selection probability and per-cell probability
prob_missing_val = ifdef($prob_missing_cell, 0.2)
prob_missing_col = ifdef($prob_missing, 0.1)

# --- initial state: every column is a scale column until the categorical
# --- selection further down says otherwise
num_categorical_cols = 0.0
num_scalar_cols = as.double(num_cols)
# 0/1 column vector with one entry per column; 1 marks a scale column
scalar_ind = matrix(1, rows=num_scalar_cols, cols=1)
# Select the categorical columns and generate their values.
# Produces: categorical_ind (0/1 per column), categorical_col_ids (positions of
# the selected columns, ascending), num_categorical_cols, categorical_X
# (num_rows x num_categorical_cols, only when at least one column was selected)
# and the updated scalar_ind. The selected column ids are written to
# $out_categorical.
if(prob_categorical > 0){
	# flag each column as categorical with probability prob_categorical
	categorical_ind = Rand(rows=num_cols, cols=1, min=0, max=1, pdf="uniform")
	categorical_ind = categorical_ind < prob_categorical
	# seq * indicator keeps the id of selected columns and zeroes the rest,
	# removeEmpty then drops the zero rows
	categorical_col_ids = removeEmpty(target=seq(1, num_cols, 1)*categorical_ind, margin="rows")
	num_categorical_cols = sum(categorical_ind)
	write(categorical_col_ids, $out_categorical, format="csv")
	# The random draw may select no column at all; in that case skip the
	# generation instead of requesting matrices with a zero dimension.
	if(num_categorical_cols > 0){
		# one domain size per categorical column, an integer in
		# [min_domain_size, max_domain_size]
		domain_sizes = Rand(rows=num_categorical_cols, cols=1, min=0, max=1, pdf="uniform")
		domain_sizes = round(min_domain_size + (max_domain_size - min_domain_size)*domain_sizes)
		# scale uniform draws to integers in [1, domain size]; the transpose
		# lines the columns up with the domain_sizes column vector
		categorical_X = Rand(rows=num_rows, cols=num_categorical_cols, min=0, max=1, pdf="uniform")
		categorical_X = t(round(1 + t(categorical_X)*(domain_sizes - 1)))
	}
	# every column that is not categorical is a scale column
	scalar_ind = 1-categorical_ind
}
# Generate the scale columns and assemble the data matrix X so that every
# generated column ends up at its original column position.
scalar_col_ids = removeEmpty(target=seq(1, num_cols, 1)*scalar_ind, margin="rows")
num_scalar_cols = sum(scalar_ind)
# skip the generation when every column is categorical, instead of requesting
# a matrix with zero columns
if(num_scalar_cols > 0){
	scalar_X = Rand(rows=num_rows, cols=num_scalar_cols, min=0, max=1, pdf="uniform")
}
if(num_categorical_cols > 0 & num_scalar_cols > 0){
	# X holds [scale columns | categorical columns]; permut_mat moves column i
	# of that layout to its target position in the final matrix
	X = append(scalar_X, categorical_X)
	# row i has a single 1 in column scalar_col_ids[i]
	scalar_perm = table(seq(1, num_scalar_cols, 1), scalar_col_ids, num_scalar_cols, num_cols)
	# Row k has a single 1 in column categorical_col_ids[k]. These rows must
	# not be zero: zero rows would wipe the categorical values out of the
	# product below and leave all-zero columns in their place.
	categorical_perm = table(seq(1, num_categorical_cols, 1), categorical_col_ids, num_categorical_cols, num_cols)
	# stack the two parts vertically (append works column-wise, hence the transposes)
	permut_mat = t(append(t(scalar_perm), t(categorical_perm)))
	X = X %*% permut_mat
}else{
	if(num_categorical_cols > 0){
		X = categorical_X
	}else{
		if(num_scalar_cols > 0){
			X = scalar_X
		}else{
			print("somehow, we've managed to compute that precisely 0 cols should be categorical and 0 cols should be scale")
		}
	}
}
# Select the scale columns that get missing values, append one 0/1 mask column
# per selected column to X (an entry is 1 with probability prob_missing_val),
# and write the selected column ids to $out_missing. Finally write X to $out_X.
# NOTE(review): the masks are appended to X rather than applied to the data;
# presumably a downstream script uses them to blank out cells - confirm.
if(prob_missing_col > 0){
	# flag each column with probability prob_missing_col
	missing_col_ind = Rand(rows=num_cols, cols=1, min=0, max=1, pdf="uniform")
	missing_col_ind = missing_col_ind < prob_missing_col
	#currently only support missing value imputation for scale cols
	missing_col_ind = missing_col_ind * scalar_ind
	missing_col_ids = removeEmpty(target=seq(1, num_cols, 1)*missing_col_ind, margin="rows")
	# Count the selected columns from the indicator itself rather than from the
	# removeEmpty output, whose row count need not be 0 when nothing was selected.
	num_missing_cols = sum(missing_col_ind)
	# the random draw may select no column; then there is no mask to append
	if(num_missing_cols > 0){
		missing_values = Rand(rows=num_rows, cols=num_missing_cols, min=0, max=1, pdf="uniform")
		missing_values = missing_values < prob_missing_val
		X = append(X, missing_values)
	}
	write(missing_col_ids, $out_missing, format="csv")
}
write(X, $out_X, format="csv")