blob: 8b3af646354cb0efe02b964fd6acf69d413fda3b [file] [log] [blame]
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
# Implements builtin for imputing missing values from observed values (if they exist) using robust functional dependencies
#
# INPUT:
# --------------------------------------------------------------------------------------
# X Vector X, source attribute of functional dependency
# Y Vector Y, target attribute of functional dependency and imputation
# threshold threshold value in interval [0, 1] for robust FDs
# verbose flag for printing verbose debug output
# --------------------------------------------------------------------------------------
#
# OUTPUT:
# ----------------------------------------------------------------------------------
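# Y Vector Y after applying the learned mapping to X (result of imputeByFDApply); a 1x1 zero matrix if X or Y contain values less than 1
# Y_imp Mapping vector with one row per source value holding the index of the most frequent target value, or 0 where its relative frequency does not exceed the threshold; a 1x1 zero matrix if X or Y contain values less than 1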
# ----------------------------------------------------------------------------------
# Entry point: validates the inputs, delegates the imputation to imputeAndCorrect,
# and optionally prints the resulting vector.
#   X          source attribute of the functional dependency
#   Y          target attribute to be imputed
#   threshold  value in [0, 1]; anything outside that interval stops execution
#   verbose    when TRUE, the returned Y is printed
# Returns Y and Y_imp as produced by imputeAndCorrect, or two 1x1 zero matrices
# when X or Y contain values smaller than 1.
m_imputeByFD = function(Matrix[Double] X, Matrix[Double] Y, Double threshold, Boolean verbose = FALSE)
  return(Matrix[Double] Y, Matrix[Double] Y_imp)
{
  # validation checks
  if( threshold < 0 | threshold > 1 )
    stop("Stopping due to invalid input, threshold required in interval [0, 1] found "+threshold)

  if(min(X) < 1 | min(Y) < 1) {
    # NOTE(review): presumably values < 1 are rejected because they are later used
    # as table() indexes -- confirm. The function does not stop here; it reports
    # the problem and returns 1x1 zero placeholders.
    print("imputeByFD: source or target contain values less than 1")
    Y = matrix(0, 1, 1);
    Y_imp = matrix(0, 1, 1);
  }
  else {
    # impute missing values and fix errors
    [Y, Y_imp] = imputeAndCorrect(X, Y, threshold)
  }

  # print the actual output; X is the unmodified input and was printed here by mistake
  if(verbose)
    print("output \n"+toString(Y))
}
# Learns, for every source value in X, the most frequent target value in Y and
# applies that mapping via imputeByFDApply.
#   imputed_Y  result of imputeByFDApply on the NaN-replaced X and the mapping
#   filled     per-source-value mapping (0 where the threshold is not exceeded)
imputeAndCorrect = function(Matrix[Double] X, Matrix[Double] Y, Double threshold)
  return(Matrix[Double] imputed_Y, Matrix[Double] filled) {
  # remember which target cells are missing before the NaNs are overwritten
  isMissing = is.na(Y)
  numMissing = sum(isMissing)

  # NaN in the source becomes 1, NaN in the target becomes 0 (temporarily)
  src = replace(target = X, pattern = NaN, replacement = 1)
  tgt = replace(target = Y, pattern = NaN, replacement = 0)

  # move the missing targets to a sentinel one above the largest observed value
  sentinel = max(tgt) + 1
  tgt = (tgt * (isMissing == 0)) + (isMissing * sentinel)

  # contingency table: rows are source values, columns are target values
  counts = table(src, tgt)

  # the last column holds the sentinel, i.e. the missing values; drop it
  if(numMissing > 0 & ncol(counts) > 1) {
    lastKept = ncol(counts) - 1
    counts = counts[, 1:lastKept]
  }

  # a source value qualifies when its dominant target exceeds the threshold share
  share = rowMaxs(counts) / rowSums(counts)
  qualifies = share > threshold

  # index of the most frequent target per source value, zeroed when not qualifying
  filled = rowIndexMax(counts) * qualifies
  imputed_Y = imputeByFDApply(src, filled)
}