scripts/builtin/mice.dml - systemds - Git at Google

 #-------------------------------------------------------------
 #
 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
 #
 #-------------------------------------------------------------

 # This built-in function implements Multiple Imputation using Chained Equations (MICE)
 #
 # INPUT PARAMETERS:
 # ---------------------------------------------------------------------------------------------
 # NAME            TYPE    DEFAULT     MEANING
 # ---------------------------------------------------------------------------------------------
 # X               Double    ---        Data Matrix (Recoded Matrix for categorical features)
 # cMask           Double    ---        A 0/1 row vector for identifying numeric (0) and categorical features (1)
 # iter            Integer    3         Number of iterations for multiple imputations
 # threshold       Double     0.8      confidence value [0, 1] for robust imputation, values will only be imputed
 #                                      if the predicted value has probability greater than threshold,
 #                                      only applicable for categorical data
 # ---------------------------------------------------------------------------------------------


 #Output(s)
 # ---------------------------------------------------------------------------------------------
 # NAME                  TYPE    DEFAULT     MEANING
 # ---------------------------------------------------------------------------------------------
 # output               Double   ---        imputed dataset


 # Assumption: missing values are represented by an empty string, i.e. ",," in the CSV file.
 # Variables with suffix n store continuous/numeric data and variables with
 # suffix c store categorical data.
 m_mice= function(Matrix[Double] X, Matrix[Double] cMask, Integer iter = 3,
   Double threshold = 0.8, Boolean verbose = FALSE)
   return(Matrix[Double] output)
 {
   # Imputes the NaN cells of X one feature at a time: numeric features
   # (cMask == 0) are predicted with lm, categorical features (cMask == 1)
   # with multiLogReg, each trained on the rows where that feature is observed.
   # The sweep over all features is repeated iter times.
   # NOTE: the verbose argument is accepted but not read in this function.

   if(ncol(X) < 2)
     stop("MICE can not be applied on single vectors.
          expected number of columns > 1 found: "+ncol(X))

   if(ncol(cMask) != ncol(X))
     stop("MICE Dimension mismatch: the columns in X != columns in mask")

   # remember the caller's column count so a padding column can be dropped at the end
   lastIndex = ncol(X);
   sumMax = sum(cMask);

   # if all features are numeric add a categorical feature
   # if all features are categorical add a numeric feature
   if(sumMax == 0 | sumMax == ncol(cMask)) {
     X = cbind(X, matrix(1, nrow(X), 1))
     cMask = cbind(cMask, matrix(ifelse(sumMax==0, 1, 0), 1, 1))
   }

   # separate categorical and continuous features
   nX = removeEmpty(target=X, margin="cols", select=(cMask==0))
   cX = removeEmpty(target=X, margin="cols", select= cMask)

   # initial mean imputation of the numeric features
   # (colMeans is taken after NaN -> 0, so missing cells enter the mean as zeros)
   Mask_n = is.na(nX);
   nX = replace(target=nX, pattern=NaN, replacement=0);
   X_n = nX+(Mask_n*colMeans(nX))

   # initial mode imputation of the categorical features
   X_c = imputeByMode(cX)

   # put the two parts back into the original column order using the
   # selection matrices p (numeric positions) and q (categorical positions)
   p = table(seq(1, ncol(nX)), removeEmpty(target=seq(1, ncol(cMask)), margin="rows",
     select=t(cMask==0)), ncol(nX), ncol(X))
   q = table(seq(1, ncol(cX)), removeEmpty(target=seq(1, ncol(cMask)), margin="rows",
     select=t(cMask)), ncol(cX), ncol(X))
   X1 = (X_n %*% p) + (X_c %*% q)

   # 0/1 mask of the originally missing cells
   Mask1 =  is.na(X)

   # X keeps the observed values with zeros in the missing cells, so that
   # X + predictions yields the imputed matrix
   X = replace(target=X, pattern=NaN, replacement=0);
   d = ncol(X1)
   n = nrow(X1)

   # compute index of categorical features
   encodeIndex = removeEmpty(target=t(seq(1, ncol(X1))), margin="cols", select=cMask)

   s = "";
   for(i in 1:ncol(encodeIndex))
     s = s + as.integer(as.scalar(encodeIndex[1, i])) + ",";

   # specifications for one-hot encoding of categorical features
   jspecDC = "{ids:true, dummycode:["+s+"]}";

   for(k in 1:iter) # start iterative imputation
   {
     Mask_Filled = Mask1 # use this to store predictions for missing values
     weightMatrix = Mask1 # uses this to keep track of probabilities less than threshold
     inverseMask = Mask1 == 0
     # OHE of categorical features
     [dX, dM] = transformencode(target=as.frame(X1), spec=jspecDC);
     dist = colDist(X1, cMask) # number of distinct items in categorical features
     # i..j is the range of encoded columns in dX that belongs to feature in_c
     i=1; j=1; in_c=1;

     # "<=" so that a last feature occupying a single encoded column (e.g. a
     # numeric one) is visited as well; in_c <= d keeps the feature index in range
     while(i <= ncol(dX) & in_c <= d)
     {
       j = (i + as.scalar(dist[1,in_c])) - 1 # index value for iterating OHE columns

       if(sum(Mask1[, in_c]) > 0 & as.scalar(cMask[, in_c]) == 0) # impute numeric features
       {
         # construct column selector: drop the encoded columns of the current feature
         selX = matrix(1,1,ncol(dX))
         selX[1,i:j] = matrix(0,1,as.scalar(dist[1,in_c]))
         # prepare train data set X and Y from the rows where the feature is observed
         slice1 = removeEmpty(target = dX, margin = "rows", select = inverseMask[,in_c])
         slice1a = removeEmpty(target = X1, margin = "rows", select = inverseMask[,in_c])
         train_X = removeEmpty(target = slice1, margin = "cols", select = selX);
         train_Y = slice1a[,in_c]

         # prepare score data set X from the rows where the feature is missing
         slice2 = removeEmpty(target = dX, margin = "rows", select = Mask1[,in_c])
         test_X =  removeEmpty(target = slice2, margin = "cols", select = selX);

         # learn a regression line
         beta = lm(X=train_X, y=train_Y, verbose=FALSE, icpt=1, reg = 1e-7, tol = 1e-7);
         # predicting missing values
         pred = lmPredict(X=test_X, B=beta, ytest= matrix(0,1,1), icpt=1, verbose = FALSE)
         # imputing missing column values (assumes Mask_Filled being 0/1-matrix)
         R = removeEmpty(target=Mask_Filled[, in_c] * seq(1,n), margin="rows");
         # TODO modify removeEmpty to return zero row and n columns
         if(!(nrow(R) == 1 & as.scalar(R[1,1] == 0)))
           Mask_Filled[,in_c] = table(R, 1, pred, n, 1);

       }
       else if (sum(Mask1[, in_c]) > 0 & as.scalar(cMask[, in_c]) != 0) # impute categorical features
       {
         # construct column selector: drop the encoded columns of the current feature
         selX = matrix(1,1,ncol(dX))
         selX[1,i:j] = matrix(0,1,as.scalar(dist[1,in_c]))
         # prepare train data set X and Y from the rows where the feature is observed
         slice1 = removeEmpty(target = dX, margin = "rows", select = inverseMask[,in_c])
         slice1a = removeEmpty(target = X1, margin = "rows", select = inverseMask[,in_c])
         train_X = removeEmpty(target = slice1, margin = "cols", select = selX);
         train_Y = slice1a[,in_c]
         # prepare score data set X and Y for imputing Y
         # (test_Y holds the currently imputed values of the missing cells)
         slice2 = removeEmpty(target = dX, margin = "rows", select = Mask1[,in_c])
         slice2a = removeEmpty(target = X1, margin = "rows", select = Mask1[,in_c])
         test_X =  removeEmpty(target = slice2, margin = "cols", select = selX);
         test_Y = slice2a[,in_c]
         # train classification model
         if(min(train_Y) == max(train_Y)) { # if the train_Y has only one class then do not train
           pred = matrix(min(train_Y), nrow(test_Y), 1)
           prob = matrix(1, nrow(test_Y), 1)
         }
         else {
           beta = multiLogReg(X=train_X, Y=train_Y, icpt = 1, tol = 0.0001, reg = 0.00001,
             maxi = 50, maxii=50, verbose=FALSE)
           # predicting missing values
           [prob,pred,acc] = multiLogRegPredict(X=test_X, B=beta, Y = test_Y)
           prob = rowMaxs(prob)
         }

         # keep the previous value wherever the prediction is not confident enough
         validThreshold = prob > threshold
         pred = (pred * validThreshold) + (test_Y * (validThreshold == 0))
         # imputing missing column values (assumes Mask_Filled being 0/1-matrix)
         R = removeEmpty(target=Mask_Filled[,in_c] * seq(1,n), margin="rows");
         wR = removeEmpty(target=weightMatrix[, in_c] * seq(1,n), margin="rows");
         #TODO modify removeEmpty to return zero row and n columns
         if(!(nrow(R) == 1 & as.scalar(R[1,1] == 0))) {
           Mask_Filled[,in_c] = table(R, 1, pred, n, 1);
           weightMatrix[, in_c] = table(wR, 1, prob, n, 1)
         }
       }
       i = as.integer(j)+1
       in_c = in_c + 1
     }
     X1 = X + Mask_Filled
   }
   # Finalize the predictions, if the weight for some predictions is less than threshold than do not fill-in
   # leave the values as NaN as we do not have enough confidence about the prediction
   invalidImputations = (weightMatrix < threshold) & (weightMatrix > 0)
   makeNas = replace(target = invalidImputations, pattern = 1, replacement = NaN)
   X1 = X1 + makeNas
   # drop the padding column that may have been appended above
   output = X1[,1:lastIndex]
 }


 # Returns a 1 x ncol(X) row vector holding, for every column flagged with 1
 # in mask, the number of distinct values in that column; all other columns
 # keep the default count of 1.
 colDist= function(Matrix[Double] X, Matrix[Double] mask)
 return (Matrix[Double] dist)
 {
   numCols = ncol(X)

   # zeros are moved to a value above the current maximum of X before counting
   # (presumably because table() cannot use 0 as a category index - TODO confirm)
   zeroCode = max(X) + 1
   shifted = replace(target=X, pattern=0, replacement=zeroCode)

   dist = matrix(1, 1, numCols)
   parfor(col in 1:numCols)
   {
     if(as.scalar(mask[, col]) == 1)
     {
       # one contingency row per value; non-empty rows are the distinct values
       counts = table(shifted[, col], 1)
       dist[1, col] = sum(counts != 0)
     }
   }
 }
	#-------------------------------------------------------------
	#
	# Licensed to the Apache Software Foundation (ASF) under one
	# or more contributor license agreements. See the NOTICE file
	# distributed with this work for additional information
	# regarding copyright ownership. The ASF licenses this file
	# to you under the Apache License, Version 2.0 (the
	# "License"); you may not use this file except in compliance
	# with the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing,
	# software distributed under the License is distributed on an
	# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	# KIND, either express or implied. See the License for the
	# specific language governing permissions and limitations
	# under the License.
	#
	#-------------------------------------------------------------

	# This built-in function implements Multiple Imputation using Chained Equations (MICE)
	#
	# INPUT PARAMETERS:
	# ---------------------------------------------------------------------------------------------
	# NAME TYPE DEFAULT MEANING
	# ---------------------------------------------------------------------------------------------
	# X Double --- Data Matrix (Recoded Matrix for categorical features)
	# cMask Double --- A 0/1 row vector for identifying numeric (0) and categorical features (1)
	# iter Integer 3 Number of iterations for multiple imputations
	# threshold Double 0.8 confidence value [0, 1] for robust imputation, values will only be imputed
	# if the predicted value has probability greater than threshold,
	# only applicable for categorical data
	# ---------------------------------------------------------------------------------------------


	#Output(s)
	# ---------------------------------------------------------------------------------------------
	# NAME TYPE DEFAULT MEANING
	# ---------------------------------------------------------------------------------------------
	# output Double --- imputed dataset


	# Assumption: missing values are represented by an empty string, i.e. ",," in the CSV file.
	# Variables with suffix n store continuous/numeric data and variables with
	# suffix c store categorical data.
	m_mice= function(Matrix[Double] X, Matrix[Double] cMask, Integer iter = 3,
	  Double threshold = 0.8, Boolean verbose = FALSE)
	  return(Matrix[Double] output)
	{
	  # Imputes the NaN cells of X one feature at a time: numeric features
	  # (cMask == 0) are predicted with lm, categorical features (cMask == 1)
	  # with multiLogReg, each trained on the rows where that feature is observed.
	  # The sweep over all features is repeated iter times.
	  # NOTE: the verbose argument is accepted but not read in this function.

	  if(ncol(X) < 2)
	    stop("MICE can not be applied on single vectors.
	expected number of columns > 1 found: "+ncol(X))

	  if(ncol(cMask) != ncol(X))
	    stop("MICE Dimension mismatch: the columns in X != columns in mask")

	  # remember the caller's column count so a padding column can be dropped at the end
	  lastIndex = ncol(X);
	  sumMax = sum(cMask);

	  # if all features are numeric add a categorical feature
	  # if all features are categorical add a numeric feature
	  if(sumMax == 0 | sumMax == ncol(cMask)) {
	    X = cbind(X, matrix(1, nrow(X), 1))
	    cMask = cbind(cMask, matrix(ifelse(sumMax==0, 1, 0), 1, 1))
	  }

	  # separate categorical and continuous features
	  nX = removeEmpty(target=X, margin="cols", select=(cMask==0))
	  cX = removeEmpty(target=X, margin="cols", select= cMask)

	  # initial mean imputation of the numeric features
	  # (colMeans is taken after NaN -> 0, so missing cells enter the mean as zeros)
	  Mask_n = is.na(nX);
	  nX = replace(target=nX, pattern=NaN, replacement=0);
	  X_n = nX+(Mask_n*colMeans(nX))

	  # initial mode imputation of the categorical features
	  X_c = imputeByMode(cX)

	  # put the two parts back into the original column order using the
	  # selection matrices p (numeric positions) and q (categorical positions);
	  # this needs the matrix product %*%, not the modulo operator %%
	  p = table(seq(1, ncol(nX)), removeEmpty(target=seq(1, ncol(cMask)), margin="rows",
	    select=t(cMask==0)), ncol(nX), ncol(X))
	  q = table(seq(1, ncol(cX)), removeEmpty(target=seq(1, ncol(cMask)), margin="rows",
	    select=t(cMask)), ncol(cX), ncol(X))
	  X1 = (X_n %*% p) + (X_c %*% q)

	  # 0/1 mask of the originally missing cells
	  Mask1 = is.na(X)

	  # X keeps the observed values with zeros in the missing cells, so that
	  # X + predictions yields the imputed matrix
	  X = replace(target=X, pattern=NaN, replacement=0);
	  d = ncol(X1)
	  n = nrow(X1)

	  # compute index of categorical features
	  encodeIndex = removeEmpty(target=t(seq(1, ncol(X1))), margin="cols", select=cMask)

	  s = "";
	  for(i in 1:ncol(encodeIndex))
	    s = s + as.integer(as.scalar(encodeIndex[1, i])) + ",";

	  # specifications for one-hot encoding of categorical features
	  jspecDC = "{ids:true, dummycode:["+s+"]}";

	  for(k in 1:iter) # start iterative imputation
	  {
	    Mask_Filled = Mask1 # use this to store predictions for missing values
	    weightMatrix = Mask1 # uses this to keep track of probabilities less than threshold
	    inverseMask = Mask1 == 0
	    # OHE of categorical features
	    [dX, dM] = transformencode(target=as.frame(X1), spec=jspecDC);
	    dist = colDist(X1, cMask) # number of distinct items in categorical features
	    # i..j is the range of encoded columns in dX that belongs to feature in_c
	    i=1; j=1; in_c=1;

	    # "<=" so that a last feature occupying a single encoded column (e.g. a
	    # numeric one) is visited as well; in_c <= d keeps the feature index in range
	    while(i <= ncol(dX) & in_c <= d)
	    {
	      j = (i + as.scalar(dist[1,in_c])) - 1 # index value for iterating OHE columns

	      if(sum(Mask1[, in_c]) > 0 & as.scalar(cMask[, in_c]) == 0) # impute numeric features
	      {
	        # construct column selector: drop the encoded columns of the current feature
	        selX = matrix(1,1,ncol(dX))
	        selX[1,i:j] = matrix(0,1,as.scalar(dist[1,in_c]))
	        # prepare train data set X and Y from the rows where the feature is observed
	        slice1 = removeEmpty(target = dX, margin = "rows", select = inverseMask[,in_c])
	        slice1a = removeEmpty(target = X1, margin = "rows", select = inverseMask[,in_c])
	        train_X = removeEmpty(target = slice1, margin = "cols", select = selX);
	        train_Y = slice1a[,in_c]

	        # prepare score data set X from the rows where the feature is missing
	        slice2 = removeEmpty(target = dX, margin = "rows", select = Mask1[,in_c])
	        test_X = removeEmpty(target = slice2, margin = "cols", select = selX);

	        # learn a regression line
	        beta = lm(X=train_X, y=train_Y, verbose=FALSE, icpt=1, reg = 1e-7, tol = 1e-7);
	        # predicting missing values
	        pred = lmPredict(X=test_X, B=beta, ytest= matrix(0,1,1), icpt=1, verbose = FALSE)
	        # imputing missing column values (assumes Mask_Filled being 0/1-matrix)
	        R = removeEmpty(target=Mask_Filled[, in_c] * seq(1,n), margin="rows");
	        # TODO modify removeEmpty to return zero row and n columns
	        if(!(nrow(R) == 1 & as.scalar(R[1,1] == 0)))
	          Mask_Filled[,in_c] = table(R, 1, pred, n, 1);

	      }
	      else if (sum(Mask1[, in_c]) > 0 & as.scalar(cMask[, in_c]) != 0) # impute categorical features
	      {
	        # construct column selector: drop the encoded columns of the current feature
	        selX = matrix(1,1,ncol(dX))
	        selX[1,i:j] = matrix(0,1,as.scalar(dist[1,in_c]))
	        # prepare train data set X and Y from the rows where the feature is observed
	        slice1 = removeEmpty(target = dX, margin = "rows", select = inverseMask[,in_c])
	        slice1a = removeEmpty(target = X1, margin = "rows", select = inverseMask[,in_c])
	        train_X = removeEmpty(target = slice1, margin = "cols", select = selX);
	        train_Y = slice1a[,in_c]
	        # prepare score data set X and Y for imputing Y
	        # (test_Y holds the currently imputed values of the missing cells)
	        slice2 = removeEmpty(target = dX, margin = "rows", select = Mask1[,in_c])
	        slice2a = removeEmpty(target = X1, margin = "rows", select = Mask1[,in_c])
	        test_X = removeEmpty(target = slice2, margin = "cols", select = selX);
	        test_Y = slice2a[,in_c]
	        # train classification model
	        if(min(train_Y) == max(train_Y)) { # if the train_Y has only one class then do not train
	          pred = matrix(min(train_Y), nrow(test_Y), 1)
	          prob = matrix(1, nrow(test_Y), 1)
	        }
	        else {
	          beta = multiLogReg(X=train_X, Y=train_Y, icpt = 1, tol = 0.0001, reg = 0.00001,
	            maxi = 50, maxii=50, verbose=FALSE)
	          # predicting missing values
	          [prob,pred,acc] = multiLogRegPredict(X=test_X, B=beta, Y = test_Y)
	          prob = rowMaxs(prob)
	        }

	        # keep the previous value wherever the prediction is not confident enough
	        validThreshold = prob > threshold
	        pred = (pred * validThreshold) + (test_Y * (validThreshold == 0))
	        # imputing missing column values (assumes Mask_Filled being 0/1-matrix)
	        R = removeEmpty(target=Mask_Filled[,in_c] * seq(1,n), margin="rows");
	        wR = removeEmpty(target=weightMatrix[, in_c] * seq(1,n), margin="rows");
	        #TODO modify removeEmpty to return zero row and n columns
	        if(!(nrow(R) == 1 & as.scalar(R[1,1] == 0))) {
	          Mask_Filled[,in_c] = table(R, 1, pred, n, 1);
	          weightMatrix[, in_c] = table(wR, 1, prob, n, 1)
	        }
	      }
	      i = as.integer(j)+1
	      in_c = in_c + 1
	    }
	    X1 = X + Mask_Filled
	  }
	  # Finalize the predictions, if the weight for some predictions is less than threshold than do not fill-in
	  # leave the values as NaN as we do not have enough confidence about the prediction
	  invalidImputations = (weightMatrix < threshold) & (weightMatrix > 0)
	  makeNas = replace(target = invalidImputations, pattern = 1, replacement = NaN)
	  X1 = X1 + makeNas
	  # drop the padding column that may have been appended above
	  output = X1[,1:lastIndex]
	}


	# Returns a 1 x ncol(X) row vector holding, for every column flagged with 1
	# in mask, the number of distinct values in that column; all other columns
	# keep the default count of 1.
	colDist= function(Matrix[Double] X, Matrix[Double] mask)
	return (Matrix[Double] dist)
	{
	  numCols = ncol(X)

	  # zeros are moved to a value above the current maximum of X before counting
	  # (presumably because table() cannot use 0 as a category index - TODO confirm)
	  zeroCode = max(X) + 1
	  shifted = replace(target=X, pattern=0, replacement=zeroCode)

	  dist = matrix(1, 1, numCols)
	  parfor(col in 1:numCols)
	  {
	    if(as.scalar(mask[, col]) == 1)
	    {
	      # one contingency row per value; non-empty rows are the distinct values
	      counts = table(shifted[, col], 1)
	      dist[1, col] = sum(counts != 0)
	    }
	  }
	}