# blob: 393ce3f3914cac5b9b4d0fb34542e67355812648 [file] [log] [blame]
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
# Test driver for the mice builtin.
# Script arguments:
#   $X         - csv file, read as a frame
#   $Mask      - mask identifying the categorical columns; the code below uses
#                Mask and (1-Mask) as column selectors, i.e. it expects
#                1 = categorical and 0 = numeric
#   $iteration - forwarded to mice() as iter
#   $dataC     - output path for the categorical result columns
#   $dataN     - output path for the numeric result columns

# read data frame
F = read($X, data_type="frame", format="csv");
# the mask for identifying categorical columns
Mask = read($Mask)

# Test cases
# case 1: if all columns are categorical
if (sum(Mask) == ncol(F))
{
  # comma-separated list of all column ids (1..ncol(F)) for the recode spec
  recodeCols = "1";
  # guard the loop: with a single column the range 2:1 is not empty in DML
  # (it counts downwards) and would append bogus column ids
  if (ncol(F) > 1) {
    for (i in 2:ncol(F)) {
      recodeCols = recodeCols + "," + i;
    }
  }
  # encoding categorical columns using recode transformation
  jspecR = "{ids:true, recode:["+recodeCols+"]}";
  [X, M] = transformencode(target=F, spec=jspecR);
  # call mice
  dataset = mice(X=X, cMask=Mask, iter=$iteration, verbose=FALSE)
  # decode data back to original format
  output = as.matrix(transformdecode(target=dataset, spec=jspecR, meta=M));
  # cherry picking columns to compare with R results
  output = output[, 3:4]
  write(output, $dataC)
}
# case 2: if all data is numeric
else if (sum(Mask) == 0)
{
  # no transformation is required, cast the frame into matrix and call mice
  # as.matrix() will convert the null values into zeros, so explicitly replace zeros with NaN
  X = replace(target=as.matrix(F), pattern=0, replacement=NaN)
  output = mice(X=X, cMask=Mask, iter=$iteration, verbose=FALSE)
  write(output, $dataN)
}
# case 3: if the data is combination of numeric and categorical columns
else
{
  # column ids 1..ncol(Mask), reduced to the ids of the categorical columns
  scat = seq(1, ncol(Mask))
  catIdx = removeEmpty(target=scat, margin="rows", select=t(Mask))
  recodeCols = "" + as.integer(as.scalar(catIdx[1, 1]))
  # guard the loop: with exactly one categorical column the range 2:1 would
  # count downwards and index catIdx[2, 1] out of bounds
  if (nrow(catIdx) > 1) {
    for (i in 2:nrow(catIdx)) {
      recodeCols = recodeCols + "," + as.integer(as.scalar(catIdx[i, 1]));
    }
  }
  # encoding categorical columns using recode transformation
  jspecR = "{ids:true, recode:["+recodeCols+"]}";
  [X, M] = transformencode(target=F, spec=jspecR);
  # call mice
  dataset = mice(X=X, cMask=Mask, iter=$iteration, verbose=FALSE)
  # decode data into original format
  output = as.matrix(transformdecode(target=dataset, spec=jspecR, meta=M));
  # below lines are only for testing purpose:
  # zero out the other column type and drop the resulting empty columns
  # NOTE: a selected column whose values are all zero would be dropped as well
  catCols = output * (Mask)
  catCols = removeEmpty(target=catCols, margin="cols")
  numCols = output * (1-Mask)
  numCols = removeEmpty(target=numCols, margin="cols")
  write(numCols, $dataN)
  write(catCols, $dataC)
}