#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------

# This builtin function implements Multiple Imputation using Chained Equations (MICE).
#
# Assumption: missing values are represented as empty strings (i.e., ",,") in the CSV file.
# Variables with suffix n store continuous/numeric data and variables with
# suffix c store categorical data.
#
# INPUT:
# ------------------------------------------------------------------------------------
# X          Data Matrix (recoded matrix for categorical features)
# cMask      A 0/1 row vector identifying numeric (0) and categorical (1) features
# iter       Number of iterations for the multiple imputation
# threshold  Confidence value in [0, 1] for robust imputation: a value is only imputed
#            if the predicted value has a probability greater than the threshold;
#            only applicable to categorical data
# verbose    Flag for printing verbose debug output
# ------------------------------------------------------------------------------------
#
# OUTPUT:
# ---------------------------------------------------------------------------------
# output     Imputed dataset
# meta       Meta information: cMask, per-column missing-value indicator, and the
#            distinct-value counts of the categorical features
# threshold  The applied confidence threshold (returned unchanged)
# dM         Transformencode meta data of the one-hot encoding of categorical features
# betaList   List holding the initial imputation vector and the models (betas)
#            trained per column in the last iteration
# ---------------------------------------------------------------------------------

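# A minimal usage sketch (illustrative only, not part of this script): the file names,
# the 4-column schema, and the assumption that missing CSV cells arrive as NaN are
# assumptions made for this example.
#
#   X = read("recoded_data.csv", format="csv");      # missing cells expected as NaN
#   cMask = matrix("0 1 1 0", rows=1, cols=4);       # columns 2 and 3 are categorical
#   [imputed, meta, th, dM, betas] = mice(X=X, cMask=cMask,
#     iter=3, threshold=0.8, verbose=FALSE);
#   write(imputed, "imputed_data.csv", format="csv");
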
m_mice= function(Matrix[Double] X, Matrix[Double] cMask, Integer iter = 3,
  Double threshold = 0.8, Boolean verbose = FALSE)
  return(Matrix[Double] output, Matrix[Double] meta, Double threshold, Frame[String] dM, List[Unknown] betaList)
{
  if(ncol(X) < 2)
    stop("MICE cannot be applied to single vectors; expected number of columns > 1 but found: "+ncol(X))

  if(ncol(cMask) != ncol(X))
    stop("MICE dimension mismatch: number of columns in X != number of columns in cMask")

  lastIndex = ncol(X);
  sumMax = sum(cMask);

  # if all features are numeric, add a dummy categorical feature;
  # if all features are categorical, add a dummy numeric feature
  if(sumMax == 0 | sumMax == ncol(cMask)) {
    X = cbind(X, matrix(1, nrow(X), 1))
    cMask = cbind(cMask, matrix(ifelse(sumMax==0, 1, 0), 1, 1))
  }

  # initial imputation: column mean for numeric features, column mode for categorical features
  Mask1 = is.na(X)
  meta = rbind(cMask, (colSums(Mask1) > 0))
  X = replace(target=X, pattern=NaN, replacement=0);
  imputationVec = getInitialImputation(X, cMask)
  X1 = X + (Mask1 * imputationVec)
  d = ncol(X1)
  n = nrow(X1)
  # compute index of categorical features
  index = vectorToCsv(cMask)
  # specifications for one-hot encoding of categorical features
  jspecDC = "{ids:true, dummycode:["+index+"]}";
  [dX, dM] = transformencode(target=as.frame(X1), spec=jspecDC);
  dist = colDist(X1, cMask) # number of distinct items in categorical features
  for(k in 1:iter) # start iterative imputation
  {
    betaList = list()
    betaList = append(betaList, imputationVec)
    Mask_Filled = Mask1 # used to store predictions for missing values
    weightMatrix = Mask1 # used to keep track of prediction probabilities below the threshold
    inverseMask = Mask1 == 0
    meta = rbind(meta, dist)
    i=1; j=1; in_c=1;

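    # i/j delimit the current block of one-hot-encoded columns in dX,
    # while in_c indexes the corresponding original column of X1/cMask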
    while(i < ncol(dX))
    {
      j = (i + as.scalar(dist[1,in_c])) - 1 # end index of the current block of OHE columns
      beta = as.matrix(0)
      if(sum(Mask1[, in_c]) > 0 & as.scalar(cMask[, in_c]) == 0) # impute numeric features
      {
        # construct column selector
        selX = matrix(1,1,ncol(dX))
        selX[1,i:j] = matrix(0,1,as.scalar(dist[1,in_c]))
        selY = cbind(matrix(1,1,in_c-1), as.matrix(0), matrix(1,1,d-in_c));
        # prepare train data set X and Y
        slice1 = removeEmpty(target = dX, margin = "rows", select = inverseMask[,in_c])
        slice1a = removeEmpty(target = X1, margin = "rows", select = inverseMask[,in_c])
        train_X = removeEmpty(target = slice1, margin = "cols", select = selX);
        train_Y = slice1a[,in_c]

        # prepare score data set X and Y for imputing Y
        slice2 = removeEmpty(target = dX, margin = "rows", select = Mask1[,in_c])
        slice2a = removeEmpty(target = X1, margin = "rows", select = Mask1[,in_c])
        test_X = removeEmpty(target = slice2, margin = "cols", select = selX);
        test_Y = slice2a[,in_c]

        # learn a regression line
        beta = lm(X=train_X, y=train_Y, verbose=FALSE, icpt=1, reg = 1e-7, tol = 1e-7);
        # predicting missing values
        pred = lmPredict(X=test_X, B=beta, ytest= matrix(0,1,1), icpt=1, verbose = FALSE)
        # imputing missing column values (assumes Mask_Filled is a 0/1 matrix)
        R = removeEmpty(target=Mask_Filled[, in_c] * seq(1,n), margin="rows");
        # TODO modify removeEmpty to return zero rows and n columns
        if(!(nrow(R) == 1 & as.scalar(R[1,1] == 0)))
          Mask_Filled[,in_c] = table(R, 1, pred, n, 1);
      }
      else if (sum(Mask1[, in_c]) > 0 & as.scalar(cMask[, in_c]) != 0) # impute categorical features
      {
        # construct column selector
        selX = matrix(1,1,ncol(dX))
        selX[1,i:j] = matrix(0,1,as.scalar(dist[1,in_c]))
        selY = cbind(matrix(1,1,in_c-1), as.matrix(0), matrix(1,1,d-in_c));
        # prepare train data set X and Y
        slice1 = removeEmpty(target = dX, margin = "rows", select = inverseMask[,in_c])
        slice1a = removeEmpty(target = X1, margin = "rows", select = inverseMask[,in_c])
        train_X = removeEmpty(target = slice1, margin = "cols", select = selX);
        train_Y = slice1a[,in_c]
        # prepare score data set X and Y for imputing Y
        slice2 = removeEmpty(target = dX, margin = "rows", select = Mask1[,in_c])
        slice2a = removeEmpty(target = X1, margin = "rows", select = Mask1[,in_c])
        test_X = removeEmpty(target = slice2, margin = "cols", select = selX);
        test_Y = slice2a[,in_c]
        # train classification model
        if(min(train_Y) == max(train_Y)) { # if train_Y has only one class, skip training
          pred = matrix(min(train_Y), nrow(test_Y), 1)
          prob = matrix(1, nrow(test_Y), 1)
        }
        else {
          beta = multiLogReg(X=train_X, Y=train_Y, icpt = 1, tol = 0.0001, reg = 0.00001,
            maxi = 20, maxii=20, verbose=FALSE)
          # predicting missing values
          [prob, pred, acc] = multiLogRegPredict(X=test_X, B=beta, Y = test_Y)
          prob = rowMaxs(prob)
        }
        validThreshold = prob > threshold
        pred = (pred * validThreshold) + (test_Y * (validThreshold == 0))
        # imputing missing column values (assumes Mask_Filled is a 0/1 matrix)
        R = removeEmpty(target=Mask_Filled[,in_c] * seq(1,n), margin="rows");
        wR = removeEmpty(target=weightMatrix[, in_c] * seq(1,n), margin="rows");
        # TODO modify removeEmpty to return zero rows and n columns
        if(!(nrow(R) == 1 & as.scalar(R[1,1] == 0))) {
          Mask_Filled[,in_c] = table(R, 1, pred, n, 1);
          weightMatrix[, in_c] = table(wR, 1, prob, n, 1)
        }
      }
      i = as.integer(j)+1
      in_c = in_c + 1
      betaList = append(betaList, beta)
    }
    X1 = X + Mask_Filled
    # one-hot encoding of categorical features
    dX = transformapply(target=as.frame(X1), spec=jspecDC, meta=dM);
  }
  # Finalize the predictions: if the probability of a prediction is below the threshold,
  # do not fill it in and leave the value as NaN, since the confidence in the prediction is too low
  invalidImputations = (weightMatrix < threshold) & (weightMatrix > 0)
  makeNas = replace(target = invalidImputations, pattern = 1, replacement = NaN)
  X1 = X1 + makeNas
  output = X1[,1:lastIndex]
}

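# Returns a 1 x ncol(X) row vector holding the number of distinct values of each
# categorical column (mask == 1); numeric columns keep the default count of 1.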
colDist= function(Matrix[Double] X, Matrix[Double] mask)
  return (Matrix[Double] dist)
{
  dist = matrix(1, 1, ncol(X))
  X = replace(target=X, pattern=0, replacement=max(X)+1)
  for(i in 1:ncol(X))
  {
    if(as.scalar(mask[,i]) == 1)
    {
      distT = table(X[, i], 1)
      dist[1, i] = sum(distT != 0)
    }
  }
}

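# Computes the initial imputation vector: column means for numeric features (mask == 0)
# and column modes (via imputeByMode) for categorical features (mask == 1).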
getInitialImputation = function(Matrix[Double] X, Matrix[Double] mask)
  return(Matrix[Double] imputationVec)
{
  meanVec = matrix(0, rows=1, cols=ncol(X))
  for(i in 1:ncol(X))
  {
    if(as.scalar(mask[, i]) == 0)
    {
      Xcol = removeEmpty(target = X[, i], margin="rows", select = (is.na(X[, i]) == 0))
      meanVec[1, i] = mean(Xcol)
    }
  }
  cX = X*mask
  [X_c, colMode] = imputeByMode(cX)
  imputationVec = meanVec + colMode
}