# blob: 2c9d0ca8fc48ae6bbdabed42e06435025681d70b
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
# Optional command-line arguments; a single blank (" ") marks "not supplied".
cmdLine_missing_value_maps = ifdef($missing_value_maps, " ")
cmdLine_bin_defns = ifdef($bin_defns, " ")
cmdLine_dummy_code_maps = ifdef($dummy_code_maps, " ")
cmdLine_normalization_maps = ifdef($normalization_maps, " ")

# Input matrix. By default every column is treated as a data column.
original_X = read($X)
X = original_X

# When missing-value maps are supplied, the trailing nrow(missing_val_maps)
# columns of the input are split off (they are read later as the
# missing-indicator columns) and only the leading columns are kept as data.
if (cmdLine_missing_value_maps != " ") {
    missing_val_maps = read(cmdLine_missing_value_maps)
    last_data_col = ncol(original_X) - nrow(missing_val_maps)
    X = original_X[, 1:last_data_col]
}
# attrinfo holds one row per data attribute (column of X) and ten settings:
#   col  1: index of the attribute's missing-indicator column, 0 if none
#   col  2: global mean used for imputation
#   col  3: number of bins (binning only)
#   col  4: bin width (binning only)
#   col  5: minimum value (binning only)
#   col  6: first output column of the attribute (dummy coding)
#   col  7: last output column of the attribute (dummy coding)
#   col  8: 1 if the attribute is normalized, 0 otherwise
#   col  9: mean used for normalization
#   col 10: standard deviation used for z-scoring;
#           -1 means "subtract the mean only"
num_attrs = ncol(X)
attrinfo = matrix(0, rows=num_attrs, cols=10)
# Missing-value handling: keep the indicator columns that trail the data
# columns and record, per affected attribute, which indicator column belongs
# to it (attrinfo col 1) and the mean to impute with (attrinfo col 2).
if (cmdLine_missing_value_maps != " ") {
    first_indicator_col = last_data_col + 1
    missing_indicator_mat = original_X[, first_indicator_col:ncol(original_X)]

    parfor (m in 1:nrow(missing_val_maps), check=0) {
        target_attr = as.scalar(missing_val_maps[m, 1])
        # Row m of the map corresponds to column m of missing_indicator_mat.
        attrinfo[target_attr, 1] = m
        attrinfo[target_attr, 2] = missing_val_maps[m, 2]
    }
}
# Binning definitions: each row names an attribute (col 1) and supplies
# its bin width (col 2), minimum value (col 3) and number of bins (col 4).
if (cmdLine_bin_defns != " ") {
    bin_defns = read(cmdLine_bin_defns)

    parfor (b in 1:nrow(bin_defns), check=0) {
        target_attr = as.scalar(bin_defns[b, 1])
        attrinfo[target_attr, 4] = bin_defns[b, 2]    # bin width
        attrinfo[target_attr, 5] = bin_defns[b, 3]    # minimum value
        attrinfo[target_attr, 3] = bin_defns[b, 4]    # number of bins
    }
}
# Output column ranges (attrinfo cols 6 and 7).
if (cmdLine_dummy_code_maps == " ") {
    # No dummy coding: attribute k maps to exactly output column k,
    # so begin and end columns are both the identity sequence.
    identity_cols = seq(1, ncol(X), 1)
    attrinfo[, 6] = identity_cols
    attrinfo[, 7] = identity_cols
} else {
    # Each map row gives: attribute index, begin column, end column.
    dummy_code_maps = read(cmdLine_dummy_code_maps)

    parfor (d in 1:nrow(dummy_code_maps), check=0) {
        target_attr = as.scalar(dummy_code_maps[d, 1])
        attrinfo[target_attr, 6] = dummy_code_maps[d, 2]
        attrinfo[target_attr, 7] = dummy_code_maps[d, 3]
    }
}
# Normalization settings: each map row gives the attribute index (col 1),
# the mean (col 2) and the standard deviation (col 3). Every listed
# attribute is flagged as "normalize" in attrinfo col 8.
if (cmdLine_normalization_maps != " ") {
    normalization_map = read(cmdLine_normalization_maps)

    parfor (n in 1:nrow(normalization_map), check=0) {
        target_attr = as.scalar(normalization_map[n, 1])
        attr_mean = as.scalar(normalization_map[n, 2])
        attr_std = as.scalar(normalization_map[n, 3])

        attrinfo[target_attr, 8] = 1
        attrinfo[target_attr, 9] = attr_mean
        attrinfo[target_attr, 10] = attr_std
    }
}
#write(attrinfo, "binning/attrinfo.mtx", format="csv")

# Width of the transformed matrix: the last output column occupied by any
# attribute. attrinfo col 7 is the END column of an attribute's output range
# (col 6 is only its BEGIN column). Sizing from the begin column makes new_X
# too narrow whenever the last attribute is dummy-coded into more than one
# column, and the later write to [begin:end] then falls outside the matrix.
# Taking the maximum over all attributes does not rely on the last row
# holding the largest range.
cols_in_transformed_X = max(attrinfo[,7])

# Transformed output, filled column-range by column-range.
new_X = matrix(0, rows=nrow(X), cols=cols_in_transformed_X)

# Per-attribute warning log: col 1 = 1 if an out-of-range value was seen,
# col 2 = the offending min or max value.
log = matrix(0, rows=ncol(X), cols=2)
# Transform each attribute independently: impute, bin, then either
# normalize (single output column) or dummy-code (range of output columns).
parfor (j in 1:ncol(X), check=0) {
    values = X[, j]

    # Unpack the settings for attribute j.
    settings = attrinfo[j, ]
    indicator_col = as.scalar(settings[1, 1])
    impute_mean = as.scalar(settings[1, 2])
    bin_count = as.scalar(settings[1, 3])
    bin_size = as.scalar(settings[1, 4])
    bin_min = as.scalar(settings[1, 5])
    out_first = as.scalar(settings[1, 6])
    out_last = as.scalar(settings[1, 7])
    do_normalize = as.scalar(settings[1, 8])
    norm_mean = as.scalar(settings[1, 9])
    norm_std = as.scalar(settings[1, 10])

    if (indicator_col > 0) {
        # Imputation: add the global mean wherever the indicator is set.
        values = values + missing_indicator_mat[, indicator_col] * impute_mean
    }

    if (bin_count > 0) {
        # Equi-width binning. Entries equal to the maximum would land in
        # bin bin_count+1, so ids are clamped into [1, bin_count] below.
        values = round((values - bin_min)/bin_size - 0.5) + 1
        below_range = (values < 1)
        above_range = (values > bin_count)
        values = (1 - below_range - above_range)*values + above_range*bin_count + below_range
    }

    if (out_first != out_last) {
        # Dummy coding: values must be category ids within 1..out_width.
        out_width = out_last - out_first + 1
        smallest = min(values)
        largest = max(values)

        if (smallest >= 1 & largest <= out_width) {
            # Contingency table of (row id, category id) yields the
            # indicator columns.
            indicators = table(seq(1, nrow(X), 1), values, nrow(X), out_width)
            new_X[, out_first:out_last] = indicators
        } else {
            # Out-of-range value: leave the output columns at zero and
            # record the offending extreme for the warning log.
            log[j, 1] = 1
            if (smallest < 1) {
                log[j, 2] = smallest
            } else {
                log[j, 2] = largest
            }
        }
    } else {
        # Single output column; normalize if requested.
        if (do_normalize == 1) {
            if (norm_std == -1) {
                # -1 requests mean subtraction only.
                values = values - norm_mean
            } else {
                values = (values - norm_mean)/norm_std
            }
        }
        new_X[, out_first] = values
    }
}
# Persist the transformed matrix.
write(new_X, $transformed_X, format="text")

# Assemble the warning report: a header followed by one entry for every
# attribute flagged in the log during the transformation.
report = "Warning Messages"
for (r in 1:nrow(log)) {
    flagged = as.scalar(log[r, 1])
    if (flagged == 1) {
        report = append(report, "Unseen value in column " + r + " (" + as.scalar(log[r,2]) + ")")
    }
}
write(report, $Log)