blob: 01281d2bcca7016289d3680668f68ad2f238e7e8 [file] [log] [blame]
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
# Implements a builtin for imputing missing values from observed values (if any exist)
# using robust functional dependencies
# INPUT PARAMETERS:
# ---------------------------------------------------------------------------------------------
# NAME TYPE DEFAULT MEANING
# ---------------------------------------------------------------------------------------------
# X Matrix[Double] -- Matrix X
# sourceAttribute Integer -- column index of the source attribute used for imputation and error correction
# targetAttribute Integer -- column index of the attribute to be fixed
# threshold Double -- threshold value in interval [0, 1] for robust FDs
# ---------------------------------------------------------------------------------------------
# OUTPUT:
# ---------------------------------------------------------------------------------------------
# NAME TYPE DEFAULT MEANING
# ---------------------------------------------------------------------------------------------
# X Double --- Matrix with possible imputations
m_imputeByFD = function(Matrix[Double] X, Integer sourceAttribute, Integer targetAttribute, Double threshold)
  return(Matrix[Double] X)
{
  # Overwrites column targetAttribute of X with the values computed by
  # imputeAndCorrect from column sourceAttribute and the current target column.
  # Stops with an error message when threshold or the column indexes are invalid.

  # sanity checks
  if( threshold < 0 | threshold > 1 ) {
    stop("Stopping due to invalid input, threshold required in interval [0, 1] found "+threshold)
  }

  # column indexes start at 1, so 0 has to be rejected together with negative values;
  # otherwise the indexing below fails with an unrelated out-of-bounds error
  if( sourceAttribute < 1 | sourceAttribute > ncol(X) | targetAttribute < 1 | targetAttribute > ncol(X) ) {
    stop("Stopping due to invalid source and target")
  }

  # impute missing values and fix errors
  X[,targetAttribute] = imputeAndCorrect(X[,sourceAttribute], X[,targetAttribute], threshold)
}
imputeAndCorrect = function(Matrix[Double] X, Matrix[Double] Y, Double threshold)
  return(Matrix[Double] imputed_Y) {
  # Derives, for every value of the source column X, the target value it maps to
  # and returns a column holding that mapped value for every row.
  #   X          source column
  #   Y          target column; NaN and 0 entries are treated as missing
  #   threshold  share of a source value's observed targets that a target value
  #              must exceed to be accepted as the mapping
  # Rows whose source value has no target value above the threshold receive 0.

  XY = cbind(X, Y)

  # replace the NaN values with zero, so that 0 uniformly marks a missing cell
  XY = replace(target = XY, pattern=NaN, replacement=0)
  missing_mask = (XY == 0)

  if( sum(missing_mask[,2]) == nrow(XY) ) {
    # no observed target value at all: nothing can be learned, and dropping the
    # only (missing) column of the contingency table would be an invalid range
    imputed_Y = matrix(0, rows=nrow(X), cols=1)
  } else {
    # map the missing values to an arbitrary number (i.e., Max values + 1)
    XY = missing_mask * (colMaxs(XY)+1) + XY

    # create mapping between source and target (co-occurrence counts)
    ctab = table(XY[,1], XY[,2], 1)

    # remove the table column representing missing target values; it is the last
    # column because missing cells were mapped to max + 1
    if( sum(missing_mask[,2]) > 0 ) {
      ctab = ctab[,1:(ncol(ctab)-1)]
    }

    # 0/1 indicator of the target values whose share per source value exceeds the threshold
    ctab = (ctab / rowSums(ctab)) > threshold

    # per source value: the largest target value that passed the threshold, 0 if none did.
    # Multiplying the indicator by the column index avoids mistaking a row in which
    # every column passed (e.g. a single observed target value) for an empty row.
    filled = rowMaxs(ctab * t(seq(1, ncol(ctab))))

    # expand the per-source-value result back to one entry per input row
    imputed_Y = table(seq(1,nrow(X)), XY[,1]) %*% filled;
  }
}