scripts/datagen/genRandData4Transform.dml - systemds - Git at Google

 #-------------------------------------------------------------
 #
 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
 #
 #-------------------------------------------------------------

 #
 # Generates random data to test transform with
 #
 # rows, cols: dimensions of the data matrix to be generated
 # prob_cat: probability that a generated col is categorical
 # min_domain, max_domain: range for the domain sizes of the generated categorical cols
 # prob_missing: probability that a generated (scale) col is selected to have missing values
 # prob_missing_cell: probability that a cell of a selected col is flagged as missing
 # out_X, out_missing, out_categorical: output file names
 #

 # Size of the generated data: number of rows and columns, read from the
 # $rows / $cols script arguments with defaults of 1000 and 25.
 num_rows = ifdef($rows, 1000)
 num_cols = ifdef($cols, 25)

 # Kind of columns: prob_categorical ($prob_cat) is the threshold a per-column
 # uniform draw is compared against to mark the column categorical;
 # min_domain_size / max_domain_size ($min_domain / $max_domain) bound the
 # domain size drawn for each categorical column.
 prob_categorical = ifdef($prob_cat, 0.1)
 min_domain_size = ifdef($min_domain, 1)
 max_domain_size = ifdef($max_domain, 10)

 # Missing values: prob_missing_col ($prob_missing) is the per-column threshold
 # for selecting columns with missing values; prob_missing_val
 # ($prob_missing_cell) is the per-cell threshold inside the selected columns.
 prob_missing_col = ifdef($prob_missing, 0.1)
 prob_missing_val = ifdef($prob_missing_cell, 0.2)

 # Start from the assumption that every column is a scale column; the
 # categorical draw below overrides these defaults when it is enabled.
 num_categorical_cols = 0.0
 num_scalar_cols = as.double(num_cols)
 scalar_ind = matrix(1, rows=num_scalar_cols, cols=1)

 if(prob_categorical > 0){
   # One uniform draw per column; a draw below the threshold marks the column
   # as categorical, and the complement marks the scale columns.
   col_draws = Rand(rows=num_cols, cols=1, min=0, max=1, pdf="uniform")
   categorical_ind = col_draws < prob_categorical
   scalar_ind = 1 - categorical_ind
   num_categorical_cols = sum(categorical_ind)

   # Multiply the 1-based column positions by the indicator, drop the zero
   # rows to keep only the categorical positions, and write them out.
   col_positions = seq(1, num_cols, 1)
   categorical_col_ids = removeEmpty(target=col_positions*categorical_ind, margin="rows")
   write(categorical_col_ids, $out_categorical, format="csv")

   # Draw one domain size per categorical column between the configured bounds.
   size_draws = Rand(rows=num_categorical_cols, cols=1, min=0, max=1, pdf="uniform")
   domain_span = max_domain_size - min_domain_size
   domain_sizes = round(min_domain_size + domain_span*size_draws)

   # Stretch uniform draws so that column j takes rounded values between 1 and
   # domain_sizes[j]; the transposes line the per-column sizes up with the data.
   unit_X = Rand(rows=num_rows, cols=num_categorical_cols, min=0, max=1, pdf="uniform")
   categorical_X = t(round(1 + t(unit_X)*(domain_sizes - 1)))
 }

 # Scale columns: their 1-based positions in the final matrix, their count,
 # and uniform values between 0 and 1 for each of them.
 scalar_col_ids = removeEmpty(target=seq(1, num_cols, 1)*scalar_ind, margin="rows")
 num_scalar_cols = sum(scalar_ind)
 scalar_X = Rand(rows=num_rows, cols=num_scalar_cols, min=0, max=1, pdf="uniform")

 # Assemble X (num_rows x num_cols) from the scale and categorical parts.
 if(num_categorical_cols > 0 & num_scalar_cols > 0){
   # X is first laid out as [scale cols | categorical cols] and then multiplied
   # by a num_cols x num_cols permutation matrix that moves every column to its
   # drawn position. Row i of scalar_perm has a 1 at scalar_col_ids[i]; row j of
   # categorical_perm has a 1 at categorical_col_ids[j].
   # The categorical rows must come from categorical_col_ids: padding them with
   # zeros would wipe out all categorical values in the product.
   X = append(scalar_X, categorical_X)
   scalar_perm = table(seq(1, num_scalar_cols, 1), scalar_col_ids, num_scalar_cols, num_cols)
   categorical_perm = table(seq(1, num_categorical_cols, 1), categorical_col_ids, num_categorical_cols, num_cols)
   # Stack the two row blocks (row-wise append expressed via transposes).
   permut_mat = t(append(t(scalar_perm), t(categorical_perm)))
   X = X %*% permut_mat
 }else{
   # Only one kind of column exists, so no permutation is needed.
   if(num_categorical_cols > 0) X = categorical_X
   else{
     if(num_scalar_cols > 0) X = scalar_X
     # Neither kind has any column: X is left unassigned on this path.
     else print("somehow, we've managed to compute that precisely 0 cols should be categorical and 0 cols should be scale")
   }
 }

 if(prob_missing_col > 0){
   # One uniform draw per column; a draw below the threshold selects the column.
   # Only scale columns are eligible (missing value imputation is currently
   # supported for scale cols only), hence the mask with scalar_ind.
   missing_col_draws = Rand(rows=num_cols, cols=1, min=0, max=1, pdf="uniform")
   missing_col_ind = (missing_col_draws < prob_missing_col) * scalar_ind

   # Keep only the 1-based positions of the selected columns.
   all_col_positions = seq(1, num_cols, 1)
   missing_col_ids = removeEmpty(target=all_col_positions*missing_col_ind, margin="rows")

   # One uniform draw per (row, selected column); the comparison against the
   # per-cell threshold is appended to X as additional columns.
   # NOTE(review): the existing cells of X are not modified here, only the
   # comparison result is appended -- confirm consumers expect this layout.
   missing_cell_draws = Rand(rows=num_rows, cols=nrow(missing_col_ids), min=0, max=1, pdf="uniform")
   missing_values = missing_cell_draws < prob_missing_val
   X = append(X, missing_values)

   write(missing_col_ids, $out_missing, format="csv")
 }

 # Persist the generated data matrix.
 write(X, $out_X, format="csv")
	#-------------------------------------------------------------
	#
	# Licensed to the Apache Software Foundation (ASF) under one
	# or more contributor license agreements. See the NOTICE file
	# distributed with this work for additional information
	# regarding copyright ownership. The ASF licenses this file
	# to you under the Apache License, Version 2.0 (the
	# "License"); you may not use this file except in compliance
	# with the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing,
	# software distributed under the License is distributed on an
	# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	# KIND, either express or implied. See the License for the
	# specific language governing permissions and limitations
	# under the License.
	#
	#-------------------------------------------------------------

	#
	# Generates random data to test transform with
	#
	# rows, cols: dimensions of the data matrix to be generated
	# prob_cat: probability that a generated col is categorical
	# min_domain, max_domain: range for the domain sizes of the generated categorical cols
	# prob_missing: probability that a generated (scale) col is selected to have missing values
	# prob_missing_cell: probability that a cell of a selected col is flagged as missing
	# out_X, out_missing, out_categorical: output file names
	#

	# Size of the generated data: number of rows and columns, read from the
	# $rows / $cols script arguments with defaults of 1000 and 25.
	num_rows = ifdef($rows, 1000)
	num_cols = ifdef($cols, 25)

	# Kind of columns: prob_categorical ($prob_cat) is the threshold a per-column
	# uniform draw is compared against to mark the column categorical;
	# min_domain_size / max_domain_size ($min_domain / $max_domain) bound the
	# domain size drawn for each categorical column.
	prob_categorical = ifdef($prob_cat, 0.1)
	min_domain_size = ifdef($min_domain, 1)
	max_domain_size = ifdef($max_domain, 10)

	# Missing values: prob_missing_col ($prob_missing) is the per-column threshold
	# for selecting columns with missing values; prob_missing_val
	# ($prob_missing_cell) is the per-cell threshold inside the selected columns.
	prob_missing_col = ifdef($prob_missing, 0.1)
	prob_missing_val = ifdef($prob_missing_cell, 0.2)

	# Start from the assumption that every column is a scale column; the
	# categorical draw below overrides these defaults when it is enabled.
	num_categorical_cols = 0.0
	num_scalar_cols = as.double(num_cols)
	scalar_ind = matrix(1, rows=num_scalar_cols, cols=1)

	if(prob_categorical > 0){
	  # One uniform draw per column; a draw below the threshold marks the column
	  # as categorical, and the complement marks the scale columns.
	  col_draws = Rand(rows=num_cols, cols=1, min=0, max=1, pdf="uniform")
	  categorical_ind = col_draws < prob_categorical
	  scalar_ind = 1 - categorical_ind
	  num_categorical_cols = sum(categorical_ind)

	  # Multiply the 1-based column positions by the indicator, drop the zero
	  # rows to keep only the categorical positions, and write them out.
	  col_positions = seq(1, num_cols, 1)
	  categorical_col_ids = removeEmpty(target=col_positions*categorical_ind, margin="rows")
	  write(categorical_col_ids, $out_categorical, format="csv")

	  # Draw one domain size per categorical column between the configured bounds.
	  size_draws = Rand(rows=num_categorical_cols, cols=1, min=0, max=1, pdf="uniform")
	  domain_span = max_domain_size - min_domain_size
	  domain_sizes = round(min_domain_size + domain_span*size_draws)

	  # Stretch uniform draws so that column j takes rounded values between 1 and
	  # domain_sizes[j]; the transposes line the per-column sizes up with the data.
	  unit_X = Rand(rows=num_rows, cols=num_categorical_cols, min=0, max=1, pdf="uniform")
	  categorical_X = t(round(1 + t(unit_X)*(domain_sizes - 1)))
	}

	# Scale columns: their 1-based positions in the final matrix, their count,
	# and uniform values between 0 and 1 for each of them.
	scalar_col_ids = removeEmpty(target=seq(1, num_cols, 1)*scalar_ind, margin="rows")
	num_scalar_cols = sum(scalar_ind)
	scalar_X = Rand(rows=num_rows, cols=num_scalar_cols, min=0, max=1, pdf="uniform")

	# Assemble X (num_rows x num_cols) from the scale and categorical parts.
	if(num_categorical_cols > 0 & num_scalar_cols > 0){
	  # X is first laid out as [scale cols | categorical cols] and then multiplied
	  # by a num_cols x num_cols permutation matrix that moves every column to its
	  # drawn position. Row i of scalar_perm has a 1 at scalar_col_ids[i]; row j of
	  # categorical_perm has a 1 at categorical_col_ids[j].
	  # The categorical rows must come from categorical_col_ids: padding them with
	  # zeros would wipe out all categorical values in the product.
	  X = append(scalar_X, categorical_X)
	  scalar_perm = table(seq(1, num_scalar_cols, 1), scalar_col_ids, num_scalar_cols, num_cols)
	  categorical_perm = table(seq(1, num_categorical_cols, 1), categorical_col_ids, num_categorical_cols, num_cols)
	  # Stack the two row blocks (row-wise append expressed via transposes).
	  permut_mat = t(append(t(scalar_perm), t(categorical_perm)))
	  X = X %*% permut_mat
	}else{
	  # Only one kind of column exists, so no permutation is needed.
	  if(num_categorical_cols > 0) X = categorical_X
	  else{
	    if(num_scalar_cols > 0) X = scalar_X
	    # Neither kind has any column: X is left unassigned on this path.
	    else print("somehow, we've managed to compute that precisely 0 cols should be categorical and 0 cols should be scale")
	  }
	}

	if(prob_missing_col > 0){
	  # One uniform draw per column; a draw below the threshold selects the column.
	  # Only scale columns are eligible (missing value imputation is currently
	  # supported for scale cols only), hence the mask with scalar_ind.
	  missing_col_draws = Rand(rows=num_cols, cols=1, min=0, max=1, pdf="uniform")
	  missing_col_ind = (missing_col_draws < prob_missing_col) * scalar_ind

	  # Keep only the 1-based positions of the selected columns.
	  all_col_positions = seq(1, num_cols, 1)
	  missing_col_ids = removeEmpty(target=all_col_positions*missing_col_ind, margin="rows")

	  # One uniform draw per (row, selected column); the comparison against the
	  # per-cell threshold is appended to X as additional columns.
	  # NOTE(review): the existing cells of X are not modified here, only the
	  # comparison result is appended -- confirm consumers expect this layout.
	  missing_cell_draws = Rand(rows=num_rows, cols=nrow(missing_col_ids), min=0, max=1, pdf="uniform")
	  missing_values = missing_cell_draws < prob_missing_val
	  X = append(X, missing_values)

	  write(missing_col_ids, $out_missing, format="csv")
	}

	# Persist the generated data matrix.
	write(X, $out_X, format="csv")