#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
# Builtin function that implements Multiple Imputation using Chained Equations (MICE)
# for data with numeric and nominal (categorical) features
#
# INPUT PARAMETERS:
# ---------------------------------------------------------------------------------------------
# NAME       TYPE     DEFAULT  MEANING
# ---------------------------------------------------------------------------------------------
# F          String   ---      Data frame with missing values
# cMask      Double   ---      A 0/1 row vector identifying numeric (0) and categorical (1) features
# iter       Integer  3        Number of iterations for multiple imputation
# complete   Integer  3        Index of the iteration whose complete dataset is returned as singleSet
# verbose    Boolean  FALSE    Flag for verbose output
# ---------------------------------------------------------------------------------------------
# Output(s)
# ---------------------------------------------------------------------------------------------
# NAME       TYPE     DEFAULT  MEANING
# ---------------------------------------------------------------------------------------------
# dataset    Double   ---      Imputed dataset aggregated over all iterations (mean for numeric, vote for categorical features)
# singleSet  Double   ---      The complete dataset generated by the iteration given in 'complete'
# ---------------------------------------------------------------------------------------------
# Assumption: missing values are represented as empty strings, i.e., ",," in the CSV file.
# Variables with suffix _n store continuous/numeric data; variables with suffix _c store categorical data.
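#
# Example invocation (a minimal sketch; the input file, the mask values, and calling the
# function through the builtin name 'mice' are assumptions, not part of this script):
#   F = read("data.csv", data_type="frame", format="csv");
#   cMask = matrix("0 1 0 1", rows=1, cols=4);  # 0 = numeric, 1 = categorical feature
#   [dataset, singleSet] = mice(F=F, cMask=cMask, iter=3, complete=3, verbose=FALSE);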
s_mice= function(Frame[String] F, Matrix[Double] cMask, Integer iter = 3, Integer complete = 3, Boolean verbose = FALSE)
return(Frame[String] dataset, Frame[String] singleSet)
{
if(ncol(F) == 1)
stop("invalid argument: can not apply mice on single column")
if(complete > iter)
complete = iter
# adding a temporary feature (in case all attributes are of the same type)
F = cbind(F, as.frame(matrix(1,nrow(F), 1)))
cMask = cbind(cMask, matrix(1,1,1))
n = nrow(F)
row = n*complete;
col = ncol(F)
Result = matrix(0, rows=1, cols = col)
Mask_Result = matrix(0, rows=1, cols=col)
scat = seq(1, ncol(cMask))
cat = removeEmpty(target=scat, margin="rows", select=t(cMask))
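# if all features (including the appended temporary one) are categorical,
# treat the temporary feature as numeric so that both imputation paths have at least one column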
if(nrow(cat) == ncol(F))
cMask[1,ncol(cMask)] = 0
s=""
for(i in 1:nrow(cat), check=0)
s = s+as.integer(as.scalar(cat[i, 1]))+",";
# encoding categorical columns using recode transformation
jspecR = "{ids:true, recode:["+s+"]}";
[X, M] = transformencode(target=F, spec=jspecR);
XO = replace(target=X, pattern=NaN, replacement=0);
# remove categorical features and impute continuous features with mean
eX_n = removeEmpty(target=X, margin="cols", select=(cMask==0))
col_n = ncol(eX_n);
# storing the mask/address of missing values
Mask_n = is.na(eX_n);
inverseMask_n = 1 - Mask_n;
# replacing the empty cells in encoded data with 0
eX_n = replace(target=eX_n, pattern=NaN, replacement=0);
# filling the missing data with their means
X2_n = eX_n+(Mask_n*colMeans(eX_n))
# selection matrix for mapping the compacted numeric columns back to their original column positions
p_n = table(seq(1, ncol(eX_n)), removeEmpty(target=scat, margin="rows", select=t(cMask==0)))
if(ncol(p_n) < ncol(cMask))
p_n = cbind(p_n, matrix(0, nrow(p_n), ncol(cMask)-ncol(p_n)))
q = XO * cMask
# Taking out the categorical features for initial imputation by mode
eX_c = removeEmpty(target = q, margin = "cols")
col_c = ncol(eX_c);
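# keep only the rows that have no missing categorical values for computing the column modes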
eX_c2 = removeEmpty(target = eX_c, margin = "rows", select = (rowSums(eX_c != 0)==col_c))
colMod = matrix(0, 1, ncol(eX_c))
# compute columnwise mode
parfor(i in 1: col_c) {
f = eX_c2[, i] # i-th categorical column (complete rows only)
cat_counts = table(f, 1, n, 1); # counts for each category
mode = as.scalar(rowIndexMax(t(cat_counts)));
colMod[1,i] = mode
}
# fill the missing categorical cells with the column mode (nonzero only at missing positions)
tmpMask_c = (eX_c==0) * colMod
# selection matrix for mapping the compacted categorical columns back to their original column positions
p_c = table(seq(1, ncol(tmpMask_c)), removeEmpty(target=scat, margin ="rows", select=t(cMask)), ncol(tmpMask_c), ncol(cMask))
Mask_c = tmpMask_c %*% p_c
inverseMask_c = Mask_c == 0
r = X2_n %*% p_n
qr = q + r
X2_c = qr + Mask_c
Mask_c = Mask_c != 0 # 0/1 mask of the originally missing categorical cells
# one-hot encoding of categorical features
jspecDC = "{ids:true, dummycode:["+s+"]}";
[dX, dM] = transformencode(target=as.frame(X2_c), spec=jspecDC);
# recoding of metadata of OHE features to get the number of distinct elements
[metaTransform, metaTransformMeta] = transformencode(target=dM, spec=jspecR);
metaTransform = replace(target=metaTransform, pattern=NaN, replacement=0)
# counting distinct elements in each categorical feature
dcDistincts = colMaxs(metaTransform)
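# number of one-hot encoded columns contributed by each original feature:
# the number of distinct categories for categorical features, 1 for numeric features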
dist = dcDistincts + (1-cMask)
# creating a mask matrix of OHE features
dXMask = matrix(0, 1, ncol(dX))
index = 1
for(k in 1:col) {
nDistk = as.scalar(dcDistincts[1,k]);
if(nDistk != 0) {
dXMask[1,index:(index+nDistk-1)] = matrix(1,1,nDistk)
index += nDistk;
}
else
index += 1
}
# multiple imputations
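# chained equations: in every iteration each feature is regressed on all remaining features;
# numeric targets are fitted with linear regression (lm) and categorical targets with
# multinomial logistic regression (multiLogReg); only the cells that were originally missing
# are overwritten with the new predictions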
for(k in 1:iter)
{
Mask_Filled_n = Mask_n;
Mask_Filled_c = Mask_c
in_n = 1; in_c = 1; i=1; j=1; # variables for index selection
while(i <= ncol(dX))
{
if(as.scalar(dXMask[1,i]) == 0)
{
# construct column selector
sel = cbind(matrix(1,1,i-1), as.matrix(0), matrix(1,1,ncol(dX)-i));
# prepare train data set X and Y
slice1 = removeEmpty(target = dX, margin = "rows", select = inverseMask_n[,in_n])
train_X = removeEmpty(target = slice1, margin = "cols", select = sel);
train_Y = slice1[,i]
# prepare score data set X and Y for imputing Y
slice2 = removeEmpty(target = dX, margin = "rows", select = Mask_n[,in_n])
test_X = removeEmpty(target = slice2, margin = "cols", select = sel);
test_Y = slice2[,i]
# learning a regression line
beta = lm(X=train_X, y=train_Y, verbose=FALSE, icpt=1, reg = 1e-7, tol = 1e-7);
# predicting missing values
pred = lmpredict(X=test_X, w=beta, icpt=1)
# imputing missing column values (assumes Mask_Filled being 0/1-matrix)
R = removeEmpty(target=Mask_Filled_n[,in_n] * seq(1,n), margin="rows");
#TODO modify removeEmpty to return zero row and n columns
if(!(nrow(R) == 1 & as.scalar(R[1,1] == 0)))
Mask_Filled_n[,in_n] = table(R, 1, pred, n, 1);
in_n = in_n + 1;
}
if( (as.scalar(dXMask[1,i]) == 1) & (sum(Mask_c[, in_c]) != 0) )
{
j = (i + as.scalar(dist[1,in_c])) - 1
# construct column selector
selX = matrix(1,1,ncol(dX))
selX[1,i:j] = matrix(0,1,as.scalar(dist[1,in_c]))
selY = cbind(matrix(1,1,in_c-1), as.matrix(0), matrix(1,1,col-in_c));
# prepare train data set X and Y
slice1 = removeEmpty(target = dX, margin = "rows", select = inverseMask_c[,in_c])
slice1a = removeEmpty(target = X2_c, margin = "rows", select = inverseMask_c[,in_c])
train_X = removeEmpty(target = slice1, margin = "cols", select = selX);
train_Y = slice1a[,in_c]
# prepare score data set X and Y for imputing Y
slice2 = removeEmpty(target = dX, margin = "rows", select = Mask_c[,in_c])
slice2a = removeEmpty(target = X2_c, margin = "rows", select = Mask_c[,in_c])
test_X = removeEmpty(target = slice2, margin = "cols", select = selX);
test_Y = slice2a[,in_c]
# train classification model
beta = multiLogReg(X=train_X, Y=train_Y, icpt = 1, tol = 0.00000001, reg = 0.001, maxi = 100, maxii=0, verbose=FALSE)
# predicting missing values
[prob,pred,acc] = multiLogRegPredict(X=test_X, B=beta, Y = test_Y)
# imputing missing column values (assumes Mask_Filled being 0/1-matrix)
R = removeEmpty(target=Mask_Filled_c[,in_c] * seq(1,n), margin="rows");
#TODO modify removeEmpty to return zero row and n columns
if(!(nrow(R) == 1 & as.scalar(R[1,1] == 0)))
Mask_Filled_c[,in_c] = table(R, 1, pred, n, 1);
i = as.integer(j)
}
if(in_c < col)
in_c = in_c + 1
i = i+1;
}
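# map the numeric imputations back to their original column positions and combine them
# with the categorical imputations to form the imputed values of this iteration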
nM = ((Mask_Filled_n) %*% p_n) + Mask_Filled_c
Result = rbind(Result, nM+XO)
Mask_Result = rbind(Mask_Result, nM)
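# one-hot encode the newly imputed data as input for the next iteration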
[dX, dM] = transformencode(target=as.frame(nM+XO), spec=jspecDC);
}
# compute output indices
Result = Result[2: n*iter+1, ]
Mask_Result = Mask_Result[2: n*iter+1, ]
index = (((complete*n)-n)+1)
# voting for aggregation of categorical imputations
agg = cAggregate(Mask_Result*cMask, iter, n)
# aggregating the results
Agg_Matrix = matrix(0,n, col)
for(d in 1:iter)
Agg_Matrix = Agg_Matrix + Mask_Result[(((d-1)*n)+1):(n*d),]
Agg_Matrix = (Agg_Matrix/iter)
Agg_Matrix = Agg_Matrix * (cMask == 0)
Agg_Matrix = Agg_Matrix + agg
dataset = XO + Agg_Matrix
singleSet = Result[index:row, ]
# decoding nominal columns
dataset = transformdecode(target=dataset, spec=jspecR, meta=M);
singleSet = transformdecode(target=singleSet, spec=jspecR, meta=M);
# removing the temporary column appended at the beginning
dataset = dataset[,1:col-1]
singleSet = singleSet[,1:col-1]
}
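# Helper: resolves disagreements between the categorical imputations of the individual iterations
# by majority vote; whenever two consecutive imputed datasets differ in a cell, the assignment
# that occurs in more of the generated datasets is kept, and the first (reconciled) dataset is
# returned as the aggregate.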
cAggregate = function(Matrix[Double] Mask_Result, Integer iter, Integer n)
return (Matrix[Double] agg)
{
conflict = matrix(0, n, ncol(Mask_Result))
uCount = 0
vCount = 0
for(d in seq(1,(iter-1), 1))
{
u = Mask_Result[(((d-1)*n)+1):(n*d),]
v = Mask_Result[(((d)*n)+1):(n*(d+1)),]
if(sum(u != v) > 0) {
conflict = u != v
u1 = conflict * u;
v1 = conflict * v;
for(i in 1: iter)
{
s = Mask_Result[(((i-1)*n)+1):(n*i),]
s = s * conflict
if(sum(u1 != s ) == 0)
uCount = uCount + 1
if(sum(v1 != s) == 0)
vCount = vCount + 1
}
# copy the results of u into v
if(uCount > vCount)
Mask_Result[(((d)*n)+1):(n*(d+1)),] = Mask_Result[(((d-1)*n)+1):(n*d),]
# copy the results of v into u
else
Mask_Result[(((d-1)*n)+1):(n*d),] = Mask_Result[(((d)*n)+1):(n*(d+1)),]
d = 1
}
}
agg = Mask_Result[1:n,]
}