| #------------------------------------------------------------- |
| # |
| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| # |
| #------------------------------------------------------------- |
| |
# Implements builtin for imputing missing values from observed values (if they exist)
# using robust functional dependencies
#
# INPUT PARAMETERS:
# ---------------------------------------------------------------------------------------------
# NAME             TYPE            DEFAULT  MEANING
# ---------------------------------------------------------------------------------------------
# F                Frame[String]   --       Data frame
# sourceAttribute  Integer         --       Source attribute to use for imputation and error correction
# targetAttribute  Integer         --       Attribute to be fixed
# threshold        Double          --       Threshold value in interval [0, 1] for robust FDs
# ---------------------------------------------------------------------------------------------
#
# OUTPUT:
# ---------------------------------------------------------------------------------------------
# NAME             TYPE            DEFAULT  MEANING
# ---------------------------------------------------------------------------------------------
# imputed_F        Frame[String]   --       Frame with possible imputations
# ---------------------------------------------------------------------------------------------
| |
| |
# Imputes missing values and corrects errors in one column of a frame, using the
# mapping from a source column to a target column (robust functional dependency).
#
# F                input frame
# sourceAttribute  1-based index of the column that determines the target
# targetAttribute  1-based index of the column to be fixed
# threshold        value in [0, 1] passed on to imputeAndCorrect
#
# returns imputed_F: the frame decoded back after the target column was rewritten
s_imputeByFD = function(Frame[String] F, Integer sourceAttribute, Integer targetAttribute, Double threshold)
  return(Frame[String] imputed_F)
{
  # sanity checks
  if( threshold < 0 | threshold > 1 )
    stop("Stopping due to invalid input, threshold required in interval [0, 1] found "+threshold)

  # column indexes are 1-based, so 0 has to be rejected here as well; otherwise
  # it would only fail later at X[,0] with an unrelated indexing error
  if( sourceAttribute < 1 | sourceAttribute > ncol(F) | targetAttribute < 1 | targetAttribute > ncol(F) )
    stop("Stopping due to invalid source and target")

  # detect schema for transformation: collect the indexes of all columns that
  # detectSchema reports as STRING or BOOLEAN into a comma-separated list
  schema = detectSchema(F)
  s = ""
  for(i in 1:ncol(F)) {
    if( as.scalar(schema[1,i]) == "STRING" | as.scalar(schema[1,i]) == "BOOLEAN" )
      s = s+as.integer(i)+",";
  }

  # recode the collected columns of the data frame into a numeric matrix X;
  # M is the metadata needed to decode it again
  jspecR = "{ids:true, recode:["+s+"]}";
  [X, M] = transformencode(target=F, spec=jspecR);

  # impute missing values and fix errors: the whole target column is replaced
  # by the values derived from the source column
  X[,targetAttribute] = imputeAndCorrect(X[,sourceAttribute], X[,targetAttribute], threshold)

  # getting the actual data back
  dF = transformdecode(target=X, spec=jspecR, meta=M);
  imputed_F = dF;
}
| |
# Derives, for each distinct source value, the target value it maps to and
# returns that mapped value for every row.
#
# X          source values; used as a single column (0 and NaN count as missing)
# Y          target values; used as a single column (0 and NaN count as missing)
# threshold  a target value qualifies for a source value only if its share among
#            the counted targets of that source value is strictly greater than this
#
# returns imputed_Y: nrow(X) x 1 matrix holding, per row, the largest qualifying
#                    target value of the row's source value, or 0 if none qualifies
imputeAndCorrect = function(Matrix[Double] X, Matrix[Double] Y, Double threshold)
  return(Matrix[Double] imputed_Y) {

  XY = cbind(X, Y)

  # replace the NaN values with zero, so that 0 is the only marker for missing
  XY = replace(target = XY, pattern=NaN, replacement=0)
  missing_mask = (XY == 0)

  # map the missing values to an arbitrary number (i.e., Max values + 1), which
  # gives them a valid (and the last) position in the contingency table
  XY = missing_mask * (colMaxs(XY)+1) + XY

  # create mapping between source and target: ctab[i,j] counts rows with
  # source value i and target value j
  ctab = table(XY[,1], XY[,2], 1)

  # remove the table column representing missing values (the last one)
  if(sum(missing_mask[,2]) > 0)
    ctab = ctab[,1:(ncol(ctab)-1)]

  # 0/1 indicator of the target values whose share exceeds the threshold
  ctab = (ctab / rowSums(ctab)) > threshold

  # Largest qualifying target value per source value; rows without any
  # qualifying column yield 0. Computing this directly (instead of comparing an
  # index sum against n*(n+1)/2) keeps rows where every column qualifies, which
  # is always the case when only a single target column remains.
  filled = rowMaxs(ctab * t(seq(1, ncol(ctab))))

  # look up the mapped target value for the source value of every row
  imputed_Y = table(seq(1,nrow(X)), XY[,1]) %*% filled;
}