| #------------------------------------------------------------- |
| # |
| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| # |
| #------------------------------------------------------------- |
| |
| # Builtin for imputing missing values (and correcting erroneous values) in a target |
| # column from the observed values, if any exist, using robust functional dependencies |
| # INPUT PARAMETERS: |
| # --------------------------------------------------------------------------------------------- |
| # NAME TYPE DEFAULT MEANING |
| # --------------------------------------------------------------------------------------------- |
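| # X Double -- Matrix X |
| # sourceAttribute Integer -- column index of the source attribute to use for imputation and error correction |
| # targetAttribute Integer -- column index of the attribute to be fixed |
| # threshold Double -- threshold value in interval [0, 1] for robust FDs |
| # verbose Boolean FALSE if TRUE, print the resulting matrix |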
| # --------------------------------------------------------------------------------------------- |
| |
| |
| # OUTPUT: |
| # --------------------------------------------------------------------------------------------- |
| # NAME TYPE DEFAULT MEANING |
| # --------------------------------------------------------------------------------------------- |
| # X Double --- Matrix with possible imputations |
| |
| |
| # Imputes missing values and corrects errors in one column of X by applying a |
| # robust functional dependency (FD) from a source column to a target column. |
| # |
| # X                matrix to repair; its target column is overwritten with the result |
| # sourceAttribute  column index (1-based) of the FD's determining attribute |
| # targetAttribute  column index (1-based) of the attribute to be fixed |
| # threshold        value in [0, 1], forwarded to imputeAndCorrect |
| # verbose          if TRUE, print the resulting matrix |
| # |
| # Returns X with column targetAttribute replaced by the output of imputeAndCorrect. |
| # Stops with an error if threshold is outside [0, 1] or if either column index |
| # is outside 1..ncol(X). |
| m_imputeByFD = function(Matrix[Double] X, Integer sourceAttribute, Integer targetAttribute, Double threshold, Boolean verbose = FALSE) |
|   return(Matrix[Double] X) |
| { |
|   # sanity checks |
|   if( threshold < 0 | threshold > 1 ) |
|     stop("Stopping due to invalid input, threshold required in interval [0, 1] found "+threshold) |
| |
|   # column indexes are 1-based, so 0 must be rejected just like a negative index |
|   if( sourceAttribute < 1 | sourceAttribute > ncol(X) | targetAttribute < 1 | targetAttribute > ncol(X) ) |
|     stop("Stopping due to invalid source and target") |
| |
|   # impute missing values and fix errors |
|   X[,targetAttribute] = imputeAndCorrect(X[,sourceAttribute], X[,targetAttribute], threshold) |
| |
|   if(verbose) |
|     print("output \n"+toString(X)) |
| } |
| |
| # Computes, for every row, the target value implied by the dependency X -> Y. |
| # |
| # X          source column |
| # Y          target column; NaN and 0 entries are both treated as missing |
| # threshold  a target value is accepted for a source value only if its share among |
| #            the observed (non-missing) targets of that source value is > threshold |
| # |
| # Returns imputed_Y, one value per row of X: the accepted target value for the row's |
| # source value, or 0 if no target value exceeds the threshold. If several target |
| # values exceed the threshold (only possible for threshold < 0.5), the largest |
| # one is used. |
| imputeAndCorrect = function(Matrix[Double] X, Matrix[Double] Y, Double threshold) |
|   return(Matrix[Double] imputed_Y) { |
| |
|   XY = cbind(X, Y) |
| |
|   # replace the NaN values with zero, so that NaN and 0 share one missing-value mask |
|   XY = replace(target = XY, pattern=NaN, replacement=0) |
|   missing_mask = (XY == 0) |
| |
|   # map the missing values of each column to an otherwise unused number (column max + 1) |
|   XY = missing_mask * (colMaxs(XY)+1) + XY |
| |
|   # contingency table: rows are source values, columns are target values |
|   ctab = table(XY[,1], XY[,2], 1) |
| |
|   # the missing-target category is the largest id, i.e. the last column; drop it |
|   if(sum(missing_mask[,2]) > 0) |
|     ctab = ctab[,1:(ncol(ctab)-1)] |
| |
|   # 0/1 indicator of target values whose share for a source value exceeds the threshold |
|   # (rows without observed targets divide 0 by 0 and end up all zero) |
|   ctab = ctab/(rowSums(ctab)) > threshold |
| |
|   # weight every indicator by its column index (= target value) and take the row |
|   # maximum; a row with no accepted value yields 0. This avoids a row-sum sentinel, |
|   # which cannot tell "every column accepted" from "no column accepted". |
|   filled = rowMaxs(ctab * t(seq(1, ncol(ctab)))) |
| |
|   # one-hot encode each row's source value and look up its filled target value |
|   imputed_Y = table(seq(1,nrow(X)), XY[,1]) %*% filled; |
| } |