blob: f5a373f4866c67372d11cd27d49fb7596a4e9996 [file] [log] [blame]
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
# Built-in function that implements Multiple Imputation using Chained Equations (MICE)
#
# INPUT PARAMETERS:
# ---------------------------------------------------------------------------------------------
# NAME TYPE DEFAULT MEANING
# ---------------------------------------------------------------------------------------------
# X Double --- Data Matrix (Recoded Matrix for categorical features)
# cMask Double --- A 0/1 row vector for identifying numeric (0) and categorical features (1)
# iter Integer 3 Number of iterations for multiple imputations
# threshold Double 0.8 confidence value [0, 1] for robust imputation, values will only be imputed
# if the predicted value has probability greater than threshold,
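# only applicable for categorical data
# verbose Boolean FALSE Verbosity flag (accepted for interface compatibility; not read by the implementation)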
# ---------------------------------------------------------------------------------------------
#Output(s)
# ---------------------------------------------------------------------------------------------
# NAME TYPE DEFAULT MEANING
# ---------------------------------------------------------------------------------------------
# output Double --- imputed dataset
# Assumption: missing values are represented with an empty string, i.e., ",," in the CSV file.
# Variables with suffix n store continuous/numeric data and variables with
# suffix c store categorical data.
m_mice= function(Matrix[Double] X, Matrix[Double] cMask, Integer iter = 3,
  Double threshold = 0.8, Boolean verbose = FALSE)
  return(Matrix[Double] output)
{
  # Processing steps:
  #  1. validate the inputs and make sure there is at least one numeric and
  #     one categorical feature (a constant helper column is appended if not)
  #  2. initial fill-in: mean of the observed values for numeric features,
  #     mode (imputeByMode) for categorical features
  #  3. for iter rounds, re-predict every feature that has missing cells from
  #     all other (one-hot encoded) features: lm for numeric features,
  #     multiLogReg for categorical features
  #  4. categorical predictions whose probability is below threshold are set
  #     back to NaN in the output
  # NOTE: the verbose argument is part of the signature but is not read here.

  if(ncol(X) < 2)
    stop("MICE can not be applied on single vectors.
expected number of columns > 1 found: "+ncol(X))
  if(ncol(cMask) != ncol(X))
    stop("MICE Dimension mismatch: the columns in X != columns in mask")

  # number of original columns; used to strip the helper column at the end
  lastIndex = ncol(X);
  sumMax = sum(cMask);

  # if all features are numeric add a categorical feature
  # if all features are categorical add a numeric feature
  if(sumMax == 0 | sumMax == ncol(cMask)) {
    X = cbind(X, matrix(1, nrow(X), 1))
    cMask = cbind(cMask, matrix(ifelse(sumMax==0, 1, 0), 1, 1))
  }

  # separate categorical and continuous features
  nX = removeEmpty(target=X, margin="cols", select=(cMask==0))
  cX = removeEmpty(target=X, margin="cols", select= cMask)

  # store the mask of numeric missing values
  Mask_n = is.na(nX);
  nX = replace(target=nX, pattern=NaN, replacement=0);

  # initial mean imputation: the mean is taken over the observed cells only
  # (dividing by the full row count would bias it towards zero because the
  # missing cells were just replaced by 0); max(., 1) guards columns that
  # have no observed value at all
  observed_n = colSums(Mask_n == 0)
  colMean_n = colSums(nX) / max(observed_n, 1)
  X_n = nX + (Mask_n * colMean_n)

  # initial mode imputation of the categorical features
  X_c = imputeByMode(cX)

  # reconstruct original column order using the selection matrices p and q
  p = table(seq(1, ncol(nX)), removeEmpty(target=seq(1, ncol(cMask)), margin="rows",
    select=t(cMask==0)), ncol(nX), ncol(X))
  q = table(seq(1, ncol(cX)), removeEmpty(target=seq(1, ncol(cMask)), margin="rows",
    select=t(cMask)), ncol(cX), ncol(X))
  X1 = (X_n %*% p) + (X_c %*% q)

  # 0/1 mask of all missing cells; X keeps the observed values with 0 elsewhere
  Mask1 = is.na(X)
  X = replace(target=X, pattern=NaN, replacement=0);
  n = nrow(X1)

  # compute index of categorical features
  encodeIndex = removeEmpty(target=t(seq(1, ncol(X1))), margin="cols", select=cMask)
  s = "";
  for(i in 1:ncol(encodeIndex))
    s = s + as.integer(as.scalar(encodeIndex[1, i])) + ",";
  # specifications for one-hot encoding of categorical features
  jspecDC = "{ids:true, dummycode:["+s+"]}";

  for(k in 1:iter) # start iterative imputation
  {
    Mask_Filled = Mask1   # use this to store predictions for missing values
    weightMatrix = Mask1  # uses this to keep track of probabilities less than threshold
    inverseMask = Mask1 == 0
    # OHE of categorical features
    [dX, dM] = transformencode(target=as.frame(X1), spec=jspecDC);
    dist = colDist(X1, cMask) # number of distinct items in categorical features

    # i..j is the range of encoded columns in dX that belongs to feature in_c
    i=1; j=1; in_c=1;
    # "<=" so that a last feature occupying a single encoded column is also
    # visited; otherwise its 0/1 mask entries would be added to X1 unchanged
    while(i <= ncol(dX))
    {
      j = (i + as.scalar(dist[1,in_c])) - 1 # index value for iterating OHE columns
      if(sum(Mask1[, in_c]) > 0) # only features with missing cells are imputed
      {
        # column selector: every encoded column except those of the target feature
        selX = matrix(1,1,ncol(dX))
        selX[1,i:j] = matrix(0,1,as.scalar(dist[1,in_c]))

        # train data: rows where the target feature is observed
        slice1 = removeEmpty(target = dX, margin = "rows", select = inverseMask[,in_c])
        slice1a = removeEmpty(target = X1, margin = "rows", select = inverseMask[,in_c])
        train_X = removeEmpty(target = slice1, margin = "cols", select = selX);
        train_Y = slice1a[,in_c]

        # score data: rows where the target feature is missing
        slice2 = removeEmpty(target = dX, margin = "rows", select = Mask1[,in_c])
        test_X = removeEmpty(target = slice2, margin = "cols", select = selX);

        # row indexes of the missing cells (Mask1 is a 0/1 matrix)
        R = removeEmpty(target=Mask1[, in_c] * seq(1,n), margin="rows");

        if(as.scalar(cMask[, in_c]) == 0) # impute numeric features
        {
          # learn a regression line
          beta = lm(X=train_X, y=train_Y, verbose=FALSE, icpt=1, reg = 1e-7, tol = 1e-7);
          # predicting missing values
          pred = lmPredict(X=test_X, B=beta, ytest= matrix(0,1,1), icpt=1, verbose = FALSE)
          # TODO modify removeEmpty to return zero row and n columns
          if(!(nrow(R) == 1 & as.scalar(R[1,1] == 0)))
            Mask_Filled[,in_c] = table(R, 1, pred, n, 1);
        }
        else # impute categorical features
        {
          # current (previously imputed) values of the missing cells
          slice2a = removeEmpty(target = X1, margin = "rows", select = Mask1[,in_c])
          test_Y = slice2a[,in_c]

          if(min(train_Y) == max(train_Y)) { # if the train_Y has only one class then do not train
            pred = matrix(min(train_Y), nrow(test_Y), 1)
            prob = matrix(1, nrow(test_Y), 1)
          }
          else {
            # train classification model
            beta = multiLogReg(X=train_X, Y=train_Y, icpt = 1, tol = 0.0001, reg = 0.00001,
              maxi = 50, maxii=50, verbose=FALSE)
            # predicting missing values
            [prob,pred,acc] = multiLogRegPredict(X=test_X, B=beta, Y = test_Y)
            prob = rowMaxs(prob)
          }
          # keep the previous value where the prediction is not confident enough
          validThreshold = prob > threshold
          pred = (pred * validThreshold) + (test_Y * (validThreshold == 0))
          # TODO modify removeEmpty to return zero row and n columns
          if(!(nrow(R) == 1 & as.scalar(R[1,1] == 0))) {
            Mask_Filled[,in_c] = table(R, 1, pred, n, 1);
            weightMatrix[, in_c] = table(R, 1, prob, n, 1)
          }
        }
      }
      i = as.integer(j)+1
      in_c = in_c + 1
    }
    # observed values (X is 0 at missing cells) plus the new predictions
    X1 = X + Mask_Filled
  }

  # Finalize the predictions: if the weight for some prediction is less than the threshold
  # then do not fill it in; leave the value as NaN as we do not have enough confidence
  invalidImputations = (weightMatrix < threshold) & (weightMatrix > 0)
  makeNas = replace(target = invalidImputations, pattern = 1, replacement = NaN)
  X1 = X1 + makeNas
  # drop the helper column that may have been appended above
  output = X1[,1:lastIndex]
}
colDist= function(Matrix[Double] X, Matrix[Double] mask)
  return (Matrix[Double] dist){
  # Returns a 1 x ncol(X) row vector holding, for every column flagged with 1
  # in mask, the number of distinct values in that column; columns that are
  # not flagged keep the default of 1.

  # zeros are mapped to a value larger than anything in X before counting
  # (presumably because table() cannot use 0 as a key -- TODO confirm)
  zeroSubstitute = max(X) + 1
  shifted = replace(target=X, pattern=0, replacement=zeroSubstitute)

  numCols = ncol(X)
  dist = matrix(1, 1, numCols)
  parfor(col in 1:numCols)
  {
    flagged = as.scalar(mask[, col]) == 1
    if(flagged)
    {
      # one contingency-table row per value; non-zero rows are the distinct ones
      counts = table(shifted[, col], 1)
      dist[1, col] = sum(counts != 0)
    }
  }
}