| #------------------------------------------------------------- |
| # |
| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| # |
| #------------------------------------------------------------- |
| |
| # Builtin function Implements Multiple Imputation using Chained Equations (MICE) for nominal data |
| # |
| # INPUT PARAMETERS: |
| # --------------------------------------------------------------------------------------------- |
| # NAME TYPE DEFAULT MEANING |
| # --------------------------------------------------------------------------------------------- |
| # F String --- Data Frame |
| # cMask Double --- A 0/1 row vector for identifying numeric (0) adn categorical features (1) |
| # iter Integer 3 Number of iteration for multiple imputations |
| # complete Integer 3 A complete dataset generated though a specific iteration |
| # --------------------------------------------------------------------------------------------- |
| |
| |
| #Output(s) |
| # --------------------------------------------------------------------------------------------- |
| # NAME TYPE DEFAULT MEANING |
| # --------------------------------------------------------------------------------------------- |
| # dataset Double --- imputed dataset |
| # singleSet Double --- A complete dataset generated though a specific iteration |
| |
| # Assumption missing value are represented with empty string i.e ",," in csv file |
| # variables with suffix n are storing continuous/numeric data and variables with suffix c are storing categorical data |
| s_mice= function(Frame[String] F, Matrix[Double] cMask, Integer iter = 3, Integer complete = 3, Boolean verbose = FALSE) |
| return(Frame[String] dataset, Frame[String] singleSet) |
| { |
| |
| if(ncol(F) == 1) |
| stop("invalid argument: can not apply mice on single column") |
| |
| if(complete > iter) |
| complete = iter |
| |
| |
| # adding a temporary feature (in-case all attributes are of same type) |
| F = cbind(F, as.frame(matrix(1,nrow(F), 1))) |
| cMask = cbind(cMask, matrix(1,1,1)) |
| |
| n = nrow(F) |
| row = n*complete; |
| col = ncol(F) |
| Result = matrix(0, rows=1, cols = col) |
| Mask_Result = matrix(0, rows=1, cols=col) |
| scat = seq(1, ncol(cMask)) |
| cat = removeEmpty(target=scat, margin="rows", select=t(cMask)) |
| |
| if(nrow(cat) == ncol(F)) |
| cMask[1,ncol(cMask)] = 0 |
| |
| s="" |
| for(i in 1: nrow(cat), check =0) |
| s = s+as.integer(as.scalar(cat[i, 1]))+","; |
| |
| |
| # encoding categorical columns using recode transformation |
| jspecR = "{ids:true, recode:["+s+"]}"; |
| [X, M] = transformencode(target=F, spec=jspecR); |
| |
| XO = replace(target=X, pattern=NaN, replacement=0); |
| |
| # remove categorical features and impute continuous features with mean |
| eX_n = removeEmpty(target=X, margin="cols", select=(cMask==0)) |
| col_n = ncol(eX_n); |
| # storing the mask/address of missing values |
| Mask_n = is.na(eX_n); |
| inverseMask_n = 1 - Mask_n; |
| # replacing the empty cells in encoded data with 0 |
| eX_n = replace(target=eX_n, pattern=NaN, replacement=0); |
| # filling the missing data with their means |
| X2_n = eX_n+(Mask_n*colMeans(eX_n)) |
| # matrices for computing actul data |
| p_n = table(seq(1, ncol(eX_n)), removeEmpty(target=scat, margin="rows", select=t(cMask==0))) |
| if(ncol(p_n) < ncol(cMask)) |
| p_n = cbind(p_n, matrix(0, nrow(p_n), ncol(cMask)-ncol(p_n))) |
| q = XO * cMask |
| |
| # Taking out the categorical features for initial imputation by mode |
| eX_c = removeEmpty(target = q, margin = "cols") |
| col_c = ncol(eX_c); |
| eX_c2 = removeEmpty(target = eX_c, margin = "rows", select = (rowSums(eX_c != 0)==col_c)) |
| colMod = matrix(0, 1, ncol(eX_c)) |
| # compute columnwise mode |
| parfor(i in 1: col_c) { |
| f = eX_c2[, i] # adding one in data for dealing with zero category |
| cat_counts = table(f, 1, n, 1); # counts for each category |
| mode = as.scalar(rowIndexMax(t(cat_counts))); |
| colMod[1,i] = mode |
| } |
| |
| # find the mask of missing values |
| tmpMask_c = (eX_c==0) * colMod # fill missing values with mode |
| |
| # Generate a matrix of actual length |
| p_c = table(seq(1, ncol(tmpMask_c)), removeEmpty(target=scat, margin ="rows", select=t(cMask)), ncol(tmpMask_c), ncol(cMask)) |
| |
| Mask_c = tmpMask_c %*% p_c |
| inverseMask_c = Mask_c == 0 |
| r = X2_n %*% p_n |
| qr = q + r |
| X2_c = qr + Mask_c |
| Mask_c = Mask_c != 0 |
| |
| |
| # one-hot encoding of categorical features |
| jspecDC = "{ids:true, dummycode:["+s+"]}"; |
| [dX, dM] = transformencode(target=as.frame(X2_c), spec=jspecDC); |
| |
| # recoding of metadata of OHE features to get the number of distinct elements |
| [metaTransform, metaTransformMeta] = transformencode(target=dM, spec=jspecR); |
| metaTransform = replace(target=metaTransform, pattern=NaN, replacement=0) |
| # counting distinct elements in each categorical feature |
| dcDistincts = colMaxs(metaTransform) |
| dist = dcDistincts + (1-cMask) |
| |
| # creating a mask matrix of OHE features |
| dXMask = matrix(0, 1, ncol(dX)) |
| index = 1 |
| for(k in 1:col) { |
| nDistk = as.scalar(dcDistincts[1,k]); |
| if(nDistk != 0) { |
| dXMask[1,index:(index+nDistk-1)] = matrix(1,1,nDistk) |
| index += nDistk; |
| } |
| else |
| index += 1 |
| } |
| |
| #multiple imputations |
| for(k in 1:iter) |
| { |
| Mask_Filled_n = Mask_n; |
| Mask_Filled_c = Mask_c |
| in_n = 1; in_c = 1; i=1; j=1; # variables for index selection |
| while(i <= ncol(dX)) |
| { |
| if(as.scalar(dXMask[1,i]) == 0) |
| { |
| # construct column selector |
| sel = cbind(matrix(1,1,i-1), as.matrix(0), matrix(1,1,ncol(dX)-i)); |
| # prepare train data set X and Y |
| slice1 = removeEmpty(target = dX, margin = "rows", select = inverseMask_n[,in_n]) |
| train_X = removeEmpty(target = slice1, margin = "cols", select = sel); |
| train_Y = slice1[,i] |
| # prepare score data set X and Y for imputing Y |
| slice2 = removeEmpty(target = dX, margin = "rows", select = Mask_n[,in_n]) |
| test_X = removeEmpty(target = slice2, margin = "cols", select = sel); |
| test_Y = slice2[,i] |
| # learning a regression line |
| beta = lm(X=train_X, y=train_Y, verbose=FALSE, icpt=1, reg = 1e-7, tol = 1e-7); |
| # predicting missing values |
| pred = lmpredict(X=test_X, w=beta, icpt=1) |
| # imputing missing column values (assumes Mask_Filled being 0/1-matrix) |
| R = removeEmpty(target=Mask_Filled_n[,in_n] * seq(1,n), margin="rows"); |
| #TODO modify removeEmpty to return zero row and n columns |
| if(!(nrow(R) == 1 & as.scalar(R[1,1] == 0))) |
| Mask_Filled_n[,in_n] = table(R, 1, pred, n, 1); |
| in_n = in_n + 1; |
| } |
| |
| if( (as.scalar(dXMask[1,i]) == 1) & (sum(Mask_c[, in_c]) != 0) ) |
| { |
| j = (i + as.scalar(dist[1,in_c])) - 1 |
| |
| # construct column selector |
| selX = matrix(1,1,ncol(dX)) |
| selX[1,i:j] = matrix(0,1,as.scalar(dist[1,in_c])) |
| selY = cbind(matrix(1,1,in_c-1), as.matrix(0), matrix(1,1,col-in_c)); |
| # prepare train data set X and Y |
| slice1 = removeEmpty(target = dX, margin = "rows", select = inverseMask_c[,in_c]) |
| slice1a = removeEmpty(target = X2_c, margin = "rows", select = inverseMask_c[,in_c]) |
| train_X = removeEmpty(target = slice1, margin = "cols", select = selX); |
| train_Y = slice1a[,in_c] |
| |
| # prepare score data set X and Y for imputing Y |
| slice2 = removeEmpty(target = dX, margin = "rows", select = Mask_c[,in_c]) |
| slice2a = removeEmpty(target = X2_c, margin = "rows", select = Mask_c[,in_c]) |
| test_X = removeEmpty(target = slice2, margin = "cols", select = selX); |
| test_Y = slice2a[,in_c] |
| |
| # train classification model |
| beta = multiLogReg(X=train_X, Y=train_Y, icpt = 1, tol = 0.00000001, reg = 0.001, maxi = 100, maxii=0, verbose=FALSE) |
| # predicting missing values |
| [prob,pred,acc] = multiLogRegPredict(X=test_X, B=beta, Y = test_Y) |
| # imputing missing column values (assumes Mask_Filled being 0/1-matrix) |
| R = removeEmpty(target=Mask_Filled_c[,in_c] * seq(1,n), margin="rows"); |
| #TODO modify removeEmpty to return zero row and n columns |
| if(!(nrow(R) == 1 & as.scalar(R[1,1] == 0))) |
| Mask_Filled_c[,in_c] = table(R, 1, pred, n, 1); |
| i = as.integer(j) |
| } |
| if(in_c < col) |
| in_c = in_c + 1 |
| i = i+1; |
| } |
| |
| nM = ((Mask_Filled_n) %*% p_n) + Mask_Filled_c |
| Result = rbind(Result, nM+XO) |
| Mask_Result = rbind(Mask_Result, nM) |
| [dX, dM] = transformencode(target=as.frame(nM+XO), spec=jspecDC); |
| } |
| |
| # compute output indices |
| Result = Result[2: n*iter+1, ] |
| Mask_Result = Mask_Result[2: n*iter+1, ] |
| index = (((complete*n)-n)+1) |
| # voting for aggregation of categorical imputations |
| agg = cAggregate(Mask_Result*cMask, iter, n) |
| |
| # aggregating the results |
| Agg_Matrix = matrix(0,n, col) |
| for(d in 1:iter) |
| Agg_Matrix = Agg_Matrix + Mask_Result[(((d-1)*n)+1):(n*d),] |
| Agg_Matrix = (Agg_Matrix/iter) |
| |
| Agg_Matrix = Agg_Matrix * (cMask == 0) |
| Agg_Matrix = Agg_Matrix + agg |
| |
| dataset = XO + Agg_Matrix |
| singleSet = Result[index:row, ] |
| |
| # decoding nominal columns |
| dataset = transformdecode(target=dataset, spec=jspecR, meta=M); |
| singleSet = transformdecode(target=singleSet, spec=jspecR, meta=M); |
| |
| # removing extra categorical column |
| dataset = dataset[,1:col-1] |
| singleSet = singleSet[,1:col-1] |
| } |
| |
| |
| cAggregate = function(Matrix[Double] Mask_Result, Integer iter, Integer n) |
| return (Matrix[Double] agg) |
| { |
| conflict = matrix(0, n, ncol(Mask_Result)) |
| uCount = 0 |
| vCount = 0 |
| for(d in seq(1,(iter-1), 1)) |
| { |
| u = Mask_Result[(((d-1)*n)+1):(n*d),] |
| v = Mask_Result[(((d)*n)+1):(n*(d+1)),] |
| if(sum(u != v) > 0) { |
| conflict = u != v |
| u1 = conflict * u; |
| v1 = conflict * v; |
| for(i in 1: iter) |
| { |
| s = Mask_Result[(((i-1)*n)+1):(n*i),] |
| s = s * conflict |
| if(sum(u1 != s ) == 0) |
| uCount = uCount + 1 |
| if(sum(v1 != s) == 0) |
| vCount = vCount + 1 |
| } |
| # copy the results of u in v |
| if(uCount > vCount) |
| Mask_Result[(((d)*n)+1):(n*(d+1)),] = Mask_Result[(((d-1)*n)+1):(n*d),] |
| # copy the results of v in u |
| else |
| Mask_Result[(((d-1)*n)+1):(n*d),] = Mask_Result[(((d)*n)+1):(n*(d+1)),] |
| d = 1 |
| } |
| } |
| agg = Mask_Result[1:n,] |
| } |