# src/test/scripts/applications/apply-transform/apply-transform.dml - systemds - Git at Google

 #-------------------------------------------------------------
 #
 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
 #
 #-------------------------------------------------------------

 # Optional metadata files. A single blank (" ") is the sentinel meaning
 # "argument not supplied"; the corresponding map is then never read.
 cmdLine_missing_value_maps = ifdef($missing_value_maps, " ")
 cmdLine_bin_defns = ifdef($bin_defns, " ")
 cmdLine_dummy_code_maps = ifdef($dummy_code_maps, " ")
 cmdLine_normalization_maps = ifdef($normalization_maps, " ")

 original_X = read($X)

 if(cmdLine_missing_value_maps != " "){
 	# One row of missing_val_maps per imputed attribute; the same number of
 	# trailing columns of the input is split off from the data columns.
 	missing_val_maps = read(cmdLine_missing_value_maps)
 	num_indicator_cols = nrow(missing_val_maps)

 	last_data_col = ncol(original_X) - num_indicator_cols
 	X = original_X[,1:last_data_col]
 }else{
 	# No missing-value map: every input column is a data column.
 	X = original_X
 }

 # attrinfo holds one row per data column of X. Its 10 columns are:
 #   1     column of the missing-indicator matrix for this attribute
 #         (0 if the attribute has none)
 #   2     global mean used when imputation is needed
 #   3-5   binning: number of bins, bin width, minimum value
 #   6-7   first / last output column of the attribute (a range wider
 #         than one column means the attribute is dummy coded)
 #   8     1 if normalization is required, 0 otherwise
 #   9     mean for normalization
 #   10    std for z-scoring; -1 indicates mean subtraction only
 attrinfo = matrix(0, rows=ncol(X), cols=10)

 if(cmdLine_missing_value_maps != " "){
 	# The columns after the last data column are the missing indicators.
 	first_indicator_col = last_data_col + 1
 	missing_indicator_mat = original_X[,first_indicator_col:ncol(original_X)]

 	parfor(k in 1:nrow(missing_val_maps), check=0){
 		# Column 1 of the map names the attribute, column 2 its global mean;
 		# the map's row number doubles as the indicator column id.
 		target_attr = as.scalar(missing_val_maps[k,1])
 		attrinfo[target_attr,2] = missing_val_maps[k,2]
 		attrinfo[target_attr,1] = k
 	}
 }

 # Binning metadata. Each row of bin_defns is read as
 # [attribute index, bin width, min value, number of bins].
 if(cmdLine_bin_defns != " "){
 	bin_defns = read(cmdLine_bin_defns)
 	parfor(k in 1:nrow(bin_defns), check=0){
 		target_attr = as.scalar(bin_defns[k,1])
 		attrinfo[target_attr,4] = bin_defns[k,2]
 		attrinfo[target_attr,5] = bin_defns[k,3]
 		attrinfo[target_attr,3] = bin_defns[k,4]
 	}
 }

 # Output column range of every attribute. Each row of dummy_code_maps is
 # read as [attribute index, begin column, end column].
 if(cmdLine_dummy_code_maps != " "){
 	dummy_code_maps = read(cmdLine_dummy_code_maps)
 	parfor(k in 1:nrow(dummy_code_maps), check=0){
 		target_attr = as.scalar(dummy_code_maps[k,1])
 		attrinfo[target_attr,6] = dummy_code_maps[k,2]
 		attrinfo[target_attr,7] = dummy_code_maps[k,3]
 	}
 }else{
 	# No dummy coding: attribute j occupies exactly output column j.
 	identity_cols = seq(1, ncol(X), 1)
 	attrinfo[,6] = identity_cols
 	attrinfo[,7] = identity_cols
 }

 # Normalization metadata. Each row of normalization_map is read as
 # [attribute index, mean, std]; every listed attribute is flagged in col 8.
 if(cmdLine_normalization_maps != " "){
 	normalization_map = read(cmdLine_normalization_maps)
 	parfor(k in 1:nrow(normalization_map), check=0){
 		target_attr = as.scalar(normalization_map[k,1])
 		norm_mean = as.scalar(normalization_map[k,2])
 		norm_std = as.scalar(normalization_map[k,3])

 		attrinfo[target_attr,8] = 1
 		attrinfo[target_attr,9] = norm_mean
 		attrinfo[target_attr,10] = norm_std
 	}
 }

 #write(attrinfo, "binning/attrinfo.mtx", format="csv")

 # Width of the transformed matrix. Use the END column (attrinfo col 7) of the
 # last attribute so that the matrix is wide enough when that attribute is
 # dummy coded into several columns; for a single-column attribute the begin
 # and end columns coincide.
 cols_in_transformed_X = as.scalar(attrinfo[nrow(attrinfo),7])
 new_X = matrix(0, rows=nrow(X), cols=cols_in_transformed_X)

 # log[i,1] = 1 if attribute i held a value outside its dummy-code range,
 # log[i,2] = the offending minimum or maximum value.
 log = matrix(0, rows=ncol(X), cols=2)

 # Transform every attribute independently: impute, bin, then either
 # normalize (single output column) or dummy code (several output columns).
 parfor(i in 1:ncol(X), check=0){
 	col = X[,i]

 	# Unpack the metadata row of attribute i (layout: see attrinfo above).
 	mv_col_id = as.scalar(attrinfo[i,1])
 	global_mean = as.scalar(attrinfo[i,2])
 	num_bins = as.scalar(attrinfo[i,3])
 	bin_width = as.scalar(attrinfo[i,4])
 	min_val = as.scalar(attrinfo[i,5])
 	dummy_coding_beg_col = as.scalar(attrinfo[i,6])
 	dummy_coding_end_col = as.scalar(attrinfo[i,7])
 	normalization_needed = as.scalar(attrinfo[i,8])
 	normalization_mean = as.scalar(attrinfo[i,9])
 	normalization_std = as.scalar(attrinfo[i,10])

 	if(mv_col_id > 0){
 		# Fill in with the global mean: the mean is added wherever the
 		# indicator column is non-zero.
 		# NOTE(review): presumably missing cells hold 0 in X - confirm.
 		col = col + missing_indicator_mat[,mv_col_id] * global_mean
 	}

 	if(num_bins > 0){
 		# Only for equi-width bins.

 		# Note that max_val entries will get assigned num_bins+1 here ...
 		col = round((col - min_val)/bin_width - 0.5) + 1
 		less_than_lb = (col < 1)
 		more_than_ub = (col > num_bins)

 		# ... so clamp: ids below 1 become 1, ids above num_bins become
 		# num_bins, all other ids are kept.
 		col = (1 - less_than_lb - more_than_ub)*col + more_than_ub*num_bins + less_than_lb
 	}

 	if(dummy_coding_beg_col == dummy_coding_end_col){
 		# Single output column: optionally normalize, then copy.
 		if(normalization_needed == 1){
 			if(normalization_std == -1){
 				# -1 requests mean subtraction only
 				col = col - normalization_mean
 			}else{
 				col = (col - normalization_mean)/normalization_std
 			}
 		}

 		new_X[,dummy_coding_beg_col] = col
 	}else{
 		# Dummy coding: values must be category ids in 1..num_dummy_cols.
 		num_dummy_cols = dummy_coding_end_col - dummy_coding_beg_col + 1
 		observed_min = min(col)
 		observed_max = max(col)
 		if(observed_min >= 1 & observed_max <= num_dummy_cols){
 			# table() with explicit output dimensions yields one indicator
 			# column per category id.
 			res = table(seq(1, nrow(X), 1), col, nrow(X), num_dummy_cols)
 			new_X[,dummy_coding_beg_col:dummy_coding_end_col] = res
 		}else{
 			# Out-of-range value: leave the output columns at 0 and record
 			# the offending value for the warning log.
 			log[i,1] = 1
 			if(observed_min < 1){
 				log[i,2] = observed_min
 			}else{
 				log[i,2] = observed_max
 			}
 		}
 	}
 }

 # Persist the transformed matrix.
 write(new_X, $transformed_X, format="text")

 # Build the warning log: a header followed by one entry per attribute that
 # was flagged in log[,1], quoting the offending value stored in log[,2].
 s = "Warning Messages"
 for(attr in 1:nrow(log)){
 	flagged = as.scalar(log[attr,1])
 	if(flagged == 1){
 		offending_value = as.scalar(log[attr,2])
 		s = append(s, "Unseen value in column " + attr + " (" + offending_value + ")")
 	}
 }
 write(s, $Log)
	#-------------------------------------------------------------
	#
	# Licensed to the Apache Software Foundation (ASF) under one
	# or more contributor license agreements. See the NOTICE file
	# distributed with this work for additional information
	# regarding copyright ownership. The ASF licenses this file
	# to you under the Apache License, Version 2.0 (the
	# "License"); you may not use this file except in compliance
	# with the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing,
	# software distributed under the License is distributed on an
	# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	# KIND, either express or implied. See the License for the
	# specific language governing permissions and limitations
	# under the License.
	#
	#-------------------------------------------------------------

	# Optional metadata files. A single blank (" ") is the sentinel meaning
	# "argument not supplied"; the corresponding map is then never read.
	cmdLine_missing_value_maps = ifdef($missing_value_maps, " ")
	cmdLine_bin_defns = ifdef($bin_defns, " ")
	cmdLine_dummy_code_maps = ifdef($dummy_code_maps, " ")
	cmdLine_normalization_maps = ifdef($normalization_maps, " ")

	original_X = read($X)

	if(cmdLine_missing_value_maps != " "){
		# One row of missing_val_maps per imputed attribute; the same number of
		# trailing columns of the input is split off from the data columns.
		missing_val_maps = read(cmdLine_missing_value_maps)
		num_indicator_cols = nrow(missing_val_maps)

		last_data_col = ncol(original_X) - num_indicator_cols
		X = original_X[,1:last_data_col]
	}else{
		# No missing-value map: every input column is a data column.
		X = original_X
	}

	# attrinfo holds one row per data column of X. Its 10 columns are:
	#   1     column of the missing-indicator matrix for this attribute
	#         (0 if the attribute has none)
	#   2     global mean used when imputation is needed
	#   3-5   binning: number of bins, bin width, minimum value
	#   6-7   first / last output column of the attribute (a range wider
	#         than one column means the attribute is dummy coded)
	#   8     1 if normalization is required, 0 otherwise
	#   9     mean for normalization
	#   10    std for z-scoring; -1 indicates mean subtraction only
	attrinfo = matrix(0, rows=ncol(X), cols=10)

	if(cmdLine_missing_value_maps != " "){
		# The columns after the last data column are the missing indicators.
		first_indicator_col = last_data_col + 1
		missing_indicator_mat = original_X[,first_indicator_col:ncol(original_X)]

		parfor(k in 1:nrow(missing_val_maps), check=0){
			# Column 1 of the map names the attribute, column 2 its global mean;
			# the map's row number doubles as the indicator column id.
			target_attr = as.scalar(missing_val_maps[k,1])
			attrinfo[target_attr,2] = missing_val_maps[k,2]
			attrinfo[target_attr,1] = k
		}
	}

	# Binning metadata. Each row of bin_defns is read as
	# [attribute index, bin width, min value, number of bins].
	if(cmdLine_bin_defns != " "){
		bin_defns = read(cmdLine_bin_defns)
		parfor(k in 1:nrow(bin_defns), check=0){
			target_attr = as.scalar(bin_defns[k,1])
			attrinfo[target_attr,4] = bin_defns[k,2]
			attrinfo[target_attr,5] = bin_defns[k,3]
			attrinfo[target_attr,3] = bin_defns[k,4]
		}
	}

	# Output column range of every attribute. Each row of dummy_code_maps is
	# read as [attribute index, begin column, end column].
	if(cmdLine_dummy_code_maps != " "){
		dummy_code_maps = read(cmdLine_dummy_code_maps)
		parfor(k in 1:nrow(dummy_code_maps), check=0){
			target_attr = as.scalar(dummy_code_maps[k,1])
			attrinfo[target_attr,6] = dummy_code_maps[k,2]
			attrinfo[target_attr,7] = dummy_code_maps[k,3]
		}
	}else{
		# No dummy coding: attribute j occupies exactly output column j.
		identity_cols = seq(1, ncol(X), 1)
		attrinfo[,6] = identity_cols
		attrinfo[,7] = identity_cols
	}

	# Normalization metadata. Each row of normalization_map is read as
	# [attribute index, mean, std]; every listed attribute is flagged in col 8.
	if(cmdLine_normalization_maps != " "){
		normalization_map = read(cmdLine_normalization_maps)
		parfor(k in 1:nrow(normalization_map), check=0){
			target_attr = as.scalar(normalization_map[k,1])
			norm_mean = as.scalar(normalization_map[k,2])
			norm_std = as.scalar(normalization_map[k,3])

			attrinfo[target_attr,8] = 1
			attrinfo[target_attr,9] = norm_mean
			attrinfo[target_attr,10] = norm_std
		}
	}

	#write(attrinfo, "binning/attrinfo.mtx", format="csv")

	# Width of the transformed matrix. Use the END column (attrinfo col 7) of the
	# last attribute so that the matrix is wide enough when that attribute is
	# dummy coded into several columns; for a single-column attribute the begin
	# and end columns coincide.
	cols_in_transformed_X = as.scalar(attrinfo[nrow(attrinfo),7])
	new_X = matrix(0, rows=nrow(X), cols=cols_in_transformed_X)

	# log[i,1] = 1 if attribute i held a value outside its dummy-code range,
	# log[i,2] = the offending minimum or maximum value.
	log = matrix(0, rows=ncol(X), cols=2)

	# Transform every attribute independently: impute, bin, then either
	# normalize (single output column) or dummy code (several output columns).
	parfor(i in 1:ncol(X), check=0){
		col = X[,i]

		# Unpack the metadata row of attribute i (layout: see attrinfo above).
		mv_col_id = as.scalar(attrinfo[i,1])
		global_mean = as.scalar(attrinfo[i,2])
		num_bins = as.scalar(attrinfo[i,3])
		bin_width = as.scalar(attrinfo[i,4])
		min_val = as.scalar(attrinfo[i,5])
		dummy_coding_beg_col = as.scalar(attrinfo[i,6])
		dummy_coding_end_col = as.scalar(attrinfo[i,7])
		normalization_needed = as.scalar(attrinfo[i,8])
		normalization_mean = as.scalar(attrinfo[i,9])
		normalization_std = as.scalar(attrinfo[i,10])

		if(mv_col_id > 0){
			# Fill in with the global mean: the mean is added wherever the
			# indicator column is non-zero.
			# NOTE(review): presumably missing cells hold 0 in X - confirm.
			col = col + missing_indicator_mat[,mv_col_id] * global_mean
		}

		if(num_bins > 0){
			# Only for equi-width bins.

			# Note that max_val entries will get assigned num_bins+1 here ...
			col = round((col - min_val)/bin_width - 0.5) + 1
			less_than_lb = (col < 1)
			more_than_ub = (col > num_bins)

			# ... so clamp: ids below 1 become 1, ids above num_bins become
			# num_bins, all other ids are kept.
			col = (1 - less_than_lb - more_than_ub)*col + more_than_ub*num_bins + less_than_lb
		}

		if(dummy_coding_beg_col == dummy_coding_end_col){
			# Single output column: optionally normalize, then copy.
			if(normalization_needed == 1){
				if(normalization_std == -1){
					# -1 requests mean subtraction only
					col = col - normalization_mean
				}else{
					col = (col - normalization_mean)/normalization_std
				}
			}

			new_X[,dummy_coding_beg_col] = col
		}else{
			# Dummy coding: values must be category ids in 1..num_dummy_cols.
			num_dummy_cols = dummy_coding_end_col - dummy_coding_beg_col + 1
			observed_min = min(col)
			observed_max = max(col)
			if(observed_min >= 1 & observed_max <= num_dummy_cols){
				# table() with explicit output dimensions yields one indicator
				# column per category id.
				res = table(seq(1, nrow(X), 1), col, nrow(X), num_dummy_cols)
				new_X[,dummy_coding_beg_col:dummy_coding_end_col] = res
			}else{
				# Out-of-range value: leave the output columns at 0 and record
				# the offending value for the warning log.
				log[i,1] = 1
				if(observed_min < 1){
					log[i,2] = observed_min
				}else{
					log[i,2] = observed_max
				}
			}
		}
	}

	# Persist the transformed matrix.
	write(new_X, $transformed_X, format="text")

	# Build the warning log: a header followed by one entry per attribute that
	# was flagged in log[,1], quoting the offending value stored in log[,2].
	s = "Warning Messages"
	for(attr in 1:nrow(log)){
		flagged = as.scalar(log[attr,1])
		if(flagged == 1){
			offending_value = as.scalar(log[attr,2])
			s = append(s, "Unseen value in column " + attr + " (" + offending_value + ")")
		}
	}
	write(s, $Log)