| #------------------------------------------------------------- |
| # |
| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| # |
| #------------------------------------------------------------- |
| |
| # Built-in function that implements Multiple Imputation using Chained Equations (MICE) |
| # |
| # INPUT PARAMETERS: |
| # --------------------------------------------------------------------------------------------- |
| # NAME TYPE DEFAULT MEANING |
| # --------------------------------------------------------------------------------------------- |
| # X Double --- Data Matrix (Recoded Matrix for categorical features) |
| # cMask Double --- A 0/1 row vector for identifying numeric (0) and categorical features (1) |
| # iter Integer 3 Number of iterations for multiple imputations |
| # verbose Boolean FALSE Verbosity flag (accepted by the signature, not used by the implementation) |
| # --------------------------------------------------------------------------------------------- |
| |
| |
| #Output(s) |
| # --------------------------------------------------------------------------------------------- |
| # NAME TYPE DEFAULT MEANING |
| # --------------------------------------------------------------------------------------------- |
| # output Double --- imputed dataset |
| |
| |
| # Assumption: missing values are represented by an empty string, i.e. ",," in the CSV file |
| # Variables with suffix n store continuous/numeric data and variables with suffix c store categorical data |
| m_mice= function(Matrix[Double] X, Matrix[Double] cMask, Integer iter = 3, Boolean verbose = FALSE) |
| return(Matrix[Double] output) |
| { |
| lastIndex = ncol(X); |
| sumMax = sum(cMask); |
| |
| # if all features are numeric add a categorical features |
| # if all features are categorical add a numeric features |
| if(sumMax == 0 | sumMax == ncol(cMask)) { |
| X = cbind(X, matrix(1, nrow(X), 1)) |
| cMask = cbind(cMask, matrix(ifelse(sumMax==0, 1, 0), 1, 1)) |
| } |
| |
| # separate categorical and continuous features |
| nX = removeEmpty(target=X, margin="cols", select=(cMask==0)) |
| cX = removeEmpty(target=X, margin="cols", select= cMask) |
| |
| # store the mask of numeric missing values |
| Mask_n = is.na(nX); |
| nX = replace(target=nX, pattern=NaN, replacement=0); |
| # initial mean imputation |
| X_n = nX+(Mask_n*colMeans(nX)) |
| |
| # store the mask of categorical missing values |
| Mask_c = is.na(cX); |
| cX = replace(target=cX, pattern=NaN, replacement=0); |
| colMode = colMode(cX) |
| # initial mode imputation |
| X_c = cX+(Mask_c*colMode) |
| |
| # reconstruct original matrix using sparse matrices p and q |
| p = table(seq(1, ncol(nX)), removeEmpty(target=seq(1, ncol(cMask)), margin="rows", select=t(cMask==0)), ncol(nX), ncol(X)) |
| q = table(seq(1, ncol(cX)), removeEmpty(target=seq(1, ncol(cMask)), margin="rows", select=t(cMask)), ncol(cX), ncol(X)) |
| X1 = (X_n %*% p) + (X_c %*% q) |
| Mask1 = is.na(X) |
| |
| X = replace(target=X, pattern=NaN, replacement=0); |
| d = ncol(X1) |
| n = nrow(X1) |
| |
| # compute index of categorical features |
| encodeIndex = removeEmpty(target=t(seq(1, ncol(X1))), margin="cols", select=cMask) |
| |
| s = ""; |
| for(i in 1:ncol(encodeIndex)) |
| s = s + as.integer(as.scalar(encodeIndex[1, i])) + ","; |
| |
| # specifications for one-hot encoding of categorical features |
| jspecDC = "{ids:true, dummycode:["+s+"]}"; |
| |
| for(k in 1:iter) # start iterative imputation |
| { |
| Mask_Filled = Mask1 |
| inverseMask = Mask1 == 0 |
| # OHE of categorical features |
| [dX, dM] = transformencode(target=as.frame(X1), spec=jspecDC); |
| dist = (colMaxs(X1) * cMask) + (cMask == 0) # number of distinct items in categorical features |
| i=1; j=1; in_c=1; |
| |
| while(i < ncol(dX)) |
| { |
| j = (i + as.scalar(dist[1,in_c])) - 1 # index value for iterating OHE columns |
| if(sum(Mask1[, in_c]) > 0 & as.scalar(cMask[, in_c]) == 0) # impute numeric features |
| { |
| # construct column selector |
| selX = matrix(1,1,ncol(dX)) |
| selX[1,i:j] = matrix(0,1,as.scalar(dist[1,in_c])) |
| selY = cbind(matrix(1,1,in_c-1), as.matrix(0), matrix(1,1,d-in_c)); |
| # prepare train data set X and Y |
| slice1 = removeEmpty(target = dX, margin = "rows", select = inverseMask[,in_c]) |
| slice1a = removeEmpty(target = X1, margin = "rows", select = inverseMask[,in_c]) |
| train_X = removeEmpty(target = slice1, margin = "cols", select = selX); |
| train_Y = slice1a[,in_c] |
| |
| # prepare score data set X and Y for imputing Y |
| slice2 = removeEmpty(target = dX, margin = "rows", select = Mask1[,in_c]) |
| slice2a = removeEmpty(target = X1, margin = "rows", select = Mask1[,in_c]) |
| test_X = removeEmpty(target = slice2, margin = "cols", select = selX); |
| test_Y = slice2a[,in_c] |
| |
| # learn a regression line |
| beta = lm(X=train_X, y=train_Y, verbose=FALSE, icpt=1, reg = 1e-7, tol = 1e-7); |
| # predicting missing values |
| pred = lmpredict(X=test_X, w=beta, icpt=1) |
| # imputing missing column values (assumes Mask_Filled being 0/1-matrix) |
| R = removeEmpty(target=Mask_Filled[, in_c] * seq(1,nrow(X1)), margin="rows"); |
| # TODO modify removeEmpty to return zero row and n columns |
| if(!(nrow(R) == 1 & as.scalar(R[1,1] == 0))) |
| Mask_Filled[,in_c] = table(R, 1, pred, nrow(X1), 1); |
| } |
| else if (sum(Mask1[, in_c]) > 0 & as.scalar(cMask[, in_c]) != 0) # impute categorical features |
| { |
| # construct column selector |
| selX = matrix(1,1,ncol(dX)) |
| selX[1,i:j] = matrix(0,1,as.scalar(dist[1,in_c])) |
| selY = cbind(matrix(1,1,in_c-1), as.matrix(0), matrix(1,1,d-in_c)); |
| # prepare train data set X and Y |
| slice1 = removeEmpty(target = dX, margin = "rows", select = inverseMask[,in_c]) |
| slice1a = removeEmpty(target = X1, margin = "rows", select = inverseMask[,in_c]) |
| train_X = removeEmpty(target = slice1, margin = "cols", select = selX); |
| train_Y = slice1a[,in_c] |
| |
| # prepare score data set X and Y for imputing Y |
| slice2 = removeEmpty(target = dX, margin = "rows", select = Mask1[,in_c]) |
| slice2a = removeEmpty(target = X1, margin = "rows", select = Mask1[,in_c]) |
| test_X = removeEmpty(target = slice2, margin = "cols", select = selX); |
| test_Y = slice2a[,in_c] |
| |
| # train classification model |
| beta = multiLogReg(X=train_X, Y=train_Y, icpt = 1, tol = 0.00000001, reg = 0.001, maxi = 100, maxii=0, verbose=FALSE) |
| # predicting missing values |
| [prob,pred,acc] = multiLogRegPredict(X=test_X, B=beta, Y = test_Y) |
| # imputing missing column values (assumes Mask_Filled being 0/1-matrix) |
| R = removeEmpty(target=Mask_Filled[,in_c] * seq(1,n), margin="rows"); |
| #TODO modify removeEmpty to return zero row and n columns |
| if(!(nrow(R) == 1 & as.scalar(R[1,1] == 0))) |
| Mask_Filled[,in_c] = table(R, 1, pred, n, 1); |
| } |
| i = as.integer(j)+1 |
| in_c = in_c + 1 |
| } |
| X1 = X + Mask_Filled |
| } |
| output = X1[,1:lastIndex] |
| } |
| |
| colMode = function (Matrix[Double] X) return (Matrix[Double] colMode) { |
| d = ncol(X) |
| n = nrow(X) |
| colMode = matrix(0, 1, ncol(X)) |
| # compute column wise mode |
| parfor(i in 1: d) { |
| X_c = removeEmpty(target=X, margin = "rows", select=(rowSums(X != 0)==d)) |
| cat_counts = table(X_c[, i], 1, n, 1); # counts for each category |
| colMode[1,i] = as.scalar(rowIndexMax(t(cat_counts))) # mode |
| } |
| } |