scripts/builtin/imputeByFD.dml - systemds - Git at Google

 #-------------------------------------------------------------
 #
 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
 #
 #-------------------------------------------------------------

 # Implements builtin for imputing missing values from observed values (if exist)
 # using robust functional dependencies
 # INPUT PARAMETERS:
 # ---------------------------------------------------------------------------------------------
 # NAME            TYPE    DEFAULT     MEANING
 # ---------------------------------------------------------------------------------------------
 # X               Double    --       Matrix X
 # sourceAttribute Integer   --       source attribute to use for imputation and error correction
 # targetAttribute Integer   --       attribute to be fixed
 # threshold       Double    --       threshold value in interval [0, 1] for robust FDs
 # verbose         Boolean   FALSE    if TRUE, print the resulting matrix
 # ---------------------------------------------------------------------------------------------


 #Output(s)
 # ---------------------------------------------------------------------------------------------
 # NAME            TYPE    DEFAULT     MEANING
 # ---------------------------------------------------------------------------------------------
 # X               Double   ---        Matrix with possible imputations


 # Rewrites one column of X using values derived from another column of X.
 #
 # X               matrix whose target column is replaced
 # sourceAttribute 1-based index of the column used as the source
 # targetAttribute 1-based index of the column to be fixed
 # threshold       value in [0, 1], forwarded to imputeAndCorrect
 # verbose         if TRUE, print the resulting matrix (default FALSE)
 #
 # Returns X with column targetAttribute replaced by the result of
 # imputeAndCorrect(X[,sourceAttribute], X[,targetAttribute], threshold).
 # Stops with an error if threshold is outside [0, 1] or if either column
 # index is outside 1..ncol(X).
 m_imputeByFD = function(Matrix[Double] X, Integer sourceAttribute, Integer targetAttribute, Double threshold, Boolean verbose = FALSE)
   return(Matrix[Double] X)
 {
   # sanity checks
   if( threshold < 0 | threshold > 1 )
     stop("Stopping due to invalid input, threshold required in interval [0, 1] found "+threshold)

   # column indices are 1-based, so 0 must be rejected together with negatives
   if(sourceAttribute < 1 | sourceAttribute > ncol(X) | targetAttribute < 1 | targetAttribute > ncol(X))
     stop("Stopping due to invalid source and target")

   # impute missing values and fix errors
   X[,targetAttribute] = imputeAndCorrect(X[,sourceAttribute], X[,targetAttribute], threshold)

   if(verbose)
     print("output \n"+toString(X))
 }

 # Derives a replacement for column Y from column X via a contingency table.
 #
 # X          single-column matrix of source values
 # Y          single-column matrix of target values; NaN and 0 are treated as missing
 # threshold  a (source, target) pair is kept only if its share of the source
 #            value's row in the contingency table is strictly greater than this
 #
 # Returns imputed_Y, one value per row of X: the target code selected for that
 # row's source value, or 0 when no code was selected.
 imputeAndCorrect = function(Matrix[Double] X, Matrix[Double] Y, Double threshold)
   return(Matrix[Double] imputed_Y) {

   # source and target side by side; NaN becomes 0 so that one comparison
   # against 0 flags every missing cell
   pairs = replace(target = cbind(X, Y), pattern=NaN, replacement=0)
   isMissing = (pairs == 0)

   # missing cells receive a per-column placeholder code: column maximum + 1
   placeholder = colMaxs(pairs) + 1
   pairs = isMissing * placeholder + pairs

   src = pairs[,1]
   tgt = pairs[,2]

   # co-occurrence counts: rows are source codes, columns are target codes
   counts = table(src, tgt, 1)

   # the placeholder is the largest target code, i.e. the last column; drop it
   if(sum(isMissing[,2]) > 0) {
     lastKept = ncol(counts) - 1
     counts = counts[,1:lastKept]
   }

   # 0/1 indicator of the pairs whose row-relative frequency exceeds threshold
   robust = (counts / rowSums(counts)) > threshold

   # tag each cell that equals its row maximum with its column index
   nCodes = ncol(robust)
   tagged = (robust == rowMaxs(robust)) * t(seq(1, nCodes))

   # a row whose cells are all equal is tagged everywhere, so its tags sum to
   # 1 + 2 + ... + nCodes; such rows are forced to 0 below
   fullSum = nCodes * (nCodes + 1) / 2
   keep = rowSums(tagged) != fullSum
   chosen = rowMaxs(tagged) * keep

   # one-hot encode each row's source code and look up its chosen target code
   rowToSource = table(seq(1, nrow(X)), src)
   imputed_Y = rowToSource %*% chosen
 }
	#-------------------------------------------------------------
	#
	# Licensed to the Apache Software Foundation (ASF) under one
	# or more contributor license agreements. See the NOTICE file
	# distributed with this work for additional information
	# regarding copyright ownership. The ASF licenses this file
	# to you under the Apache License, Version 2.0 (the
	# "License"); you may not use this file except in compliance
	# with the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing,
	# software distributed under the License is distributed on an
	# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	# KIND, either express or implied. See the License for the
	# specific language governing permissions and limitations
	# under the License.
	#
	#-------------------------------------------------------------

	# Implements builtin for imputing missing values from observed values (if exist)
	# using robust functional dependencies
	# INPUT PARAMETERS:
	# ---------------------------------------------------------------------------------------------
	# NAME TYPE DEFAULT MEANING
	# ---------------------------------------------------------------------------------------------
	# X               Double    --       Matrix X
	# sourceAttribute Integer   --       source attribute to use for imputation and error correction
	# targetAttribute Integer   --       attribute to be fixed
	# threshold       Double    --       threshold value in interval [0, 1] for robust FDs
	# verbose         Boolean   FALSE    if TRUE, print the resulting matrix
	# ---------------------------------------------------------------------------------------------


	#Output(s)
	# ---------------------------------------------------------------------------------------------
	# NAME TYPE DEFAULT MEANING
	# ---------------------------------------------------------------------------------------------
	# X Double --- Matrix with possible imputations


	# Rewrites one column of X using values derived from another column of X.
	#
	# X               matrix whose target column is replaced
	# sourceAttribute 1-based index of the column used as the source
	# targetAttribute 1-based index of the column to be fixed
	# threshold       value in [0, 1], forwarded to imputeAndCorrect
	# verbose         if TRUE, print the resulting matrix (default FALSE)
	#
	# Returns X with column targetAttribute replaced by the result of
	# imputeAndCorrect(X[,sourceAttribute], X[,targetAttribute], threshold).
	# Stops with an error if threshold is outside [0, 1] or if either column
	# index is outside 1..ncol(X).
	m_imputeByFD = function(Matrix[Double] X, Integer sourceAttribute, Integer targetAttribute, Double threshold, Boolean verbose = FALSE)
	  return(Matrix[Double] X)
	{
	  # sanity checks
	  if( threshold < 0 | threshold > 1 )
	    stop("Stopping due to invalid input, threshold required in interval [0, 1] found "+threshold)

	  # column indices are 1-based, so 0 must be rejected together with negatives
	  if(sourceAttribute < 1 | sourceAttribute > ncol(X) | targetAttribute < 1 | targetAttribute > ncol(X))
	    stop("Stopping due to invalid source and target")

	  # impute missing values and fix errors
	  X[,targetAttribute] = imputeAndCorrect(X[,sourceAttribute], X[,targetAttribute], threshold)

	  if(verbose)
	    print("output \n"+toString(X))
	}

	# Derives a replacement for column Y from column X via a contingency table.
	#
	# X          single-column matrix of source values
	# Y          single-column matrix of target values; NaN and 0 are treated as missing
	# threshold  a (source, target) pair is kept only if its share of the source
	#            value's row in the contingency table is strictly greater than this
	#
	# Returns imputed_Y, one value per row of X: the target code selected for that
	# row's source value, or 0 when no code was selected.
	imputeAndCorrect = function(Matrix[Double] X, Matrix[Double] Y, Double threshold)
	  return(Matrix[Double] imputed_Y) {

	  # source and target side by side; NaN becomes 0 so that one comparison
	  # against 0 flags every missing cell
	  pairs = replace(target = cbind(X, Y), pattern=NaN, replacement=0)
	  isMissing = (pairs == 0)

	  # missing cells receive a per-column placeholder code: column maximum + 1
	  placeholder = colMaxs(pairs) + 1
	  pairs = isMissing * placeholder + pairs

	  src = pairs[,1]
	  tgt = pairs[,2]

	  # co-occurrence counts: rows are source codes, columns are target codes
	  counts = table(src, tgt, 1)

	  # the placeholder is the largest target code, i.e. the last column; drop it
	  if(sum(isMissing[,2]) > 0) {
	    lastKept = ncol(counts) - 1
	    counts = counts[,1:lastKept]
	  }

	  # 0/1 indicator of the pairs whose row-relative frequency exceeds threshold
	  robust = (counts / rowSums(counts)) > threshold

	  # tag each cell that equals its row maximum with its column index
	  nCodes = ncol(robust)
	  tagged = (robust == rowMaxs(robust)) * t(seq(1, nCodes))

	  # a row whose cells are all equal is tagged everywhere, so its tags sum to
	  # 1 + 2 + ... + nCodes; such rows are forced to 0 below
	  fullSum = nCodes * (nCodes + 1) / 2
	  keep = rowSums(tagged) != fullSum
	  chosen = rowMaxs(tagged) * keep

	  # one-hot encode each row's source code and look up its chosen target code
	  rowToSource = table(seq(1, nrow(X)), src)
	  imputed_Y = rowToSource %*% chosen
	}