| #------------------------------------------------------------- |
| # |
| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| # |
| #------------------------------------------------------------- |
| |
| # Related to [SYSTEMDS-2662] dependency function for cleaning pipelines |
| |
| # Impute missing values (NaN): numeric features by their column median, categorical features by their column mode |
| |
| # INPUT PARAMETERS: |
| # --------------------------------------------------------------------------------------------- |
| # NAME TYPE DEFAULT MEANING |
| # --------------------------------------------------------------------------------------------- |
| # X Double --- Data Matrix (Recoded Matrix for categorical features) |
| # mask Double --- A 0/1 row vector for identifying numeric (0) and categorical features (1) |
| # --------------------------------------------------------------------------------------------- |
| |
| |
| # OUTPUT PARAMETERS: |
| # --------------------------------------------------------------------------------------------- |
| # NAME TYPE DEFAULT MEANING |
| # --------------------------------------------------------------------------------------------- |
| # X Double --- imputed dataset |
| |
| |
| |
| m_imputeByMedian = function(Matrix[Double] X, Matrix[Double] mask) |
| return(Matrix[Double] X) |
| { |
|   # Replaces NaN entries of X column-wise: |
|   #   - columns with mask == 0 (numeric) get the median of their observed values |
|   #   - columns with mask == 1 (categorical) are delegated to imputeByMode |
|   # X    : data matrix (categorical features are expected to be recoded) |
|   # mask : 0/1 row vector with one entry per column of X |
|   # Returns X with the same shape and column order, missing values imputed. |
| |
|   numSel = (mask == 0) |
|   # result accumulator; each part below scatters its columns back to their original positions |
|   out = matrix(0, rows=nrow(X), cols=ncol(X)) |
| |
|   # median imputation (only when at least one numeric column exists, otherwise |
|   # removeEmpty would have nothing to select and the scatter matrix would be invalid) |
|   if(sum(numSel) > 0) { |
|     nX = removeEmpty(target=X, margin="cols", select=numSel) |
|     Mask_n = is.na(nX) |
|     cols = ncol(nX) |
|     colMedian = matrix(0, rows=1, cols=cols) |
|     parfor(i in 1:cols, check=0) { |
|       # i indexes the compacted numeric matrix nX (not X); take the median over |
|       # the observed (non-NaN) rows of that column only |
|       # NOTE(review): for an all-NaN column removeEmpty presumably returns a single |
|       # zero row, so 0 would be imputed - confirm |
|       observed = removeEmpty(target=nX[, i], margin="rows", select=(Mask_n[, i] == 0)) |
|       colMedian[1, i] = median(observed) |
|     } |
|     # zero out NaN so that adding the (masked) medians fills exactly the missing cells |
|     nX = replace(target=nX, pattern=NaN, replacement=0) |
|     X_n = nX + (Mask_n * colMedian) |
|     # p maps the j-th numeric column to its original column position in X |
|     p = table(seq(1, cols), removeEmpty(target=seq(1, ncol(mask)), margin="rows", |
|       select=t(numSel)), cols, ncol(X)) |
|     out = out + (X_n %*% p) |
|   } |
| |
|   # mode imputation (only when at least one categorical column exists) |
|   if(sum(mask) > 0) { |
|     cX = removeEmpty(target=X, margin="cols", select=mask) |
|     X_c = imputeByMode(cX) |
|     # q maps the j-th categorical column to its original column position in X |
|     q = table(seq(1, ncol(cX)), removeEmpty(target=seq(1, ncol(mask)), margin="rows", |
|       select=t(mask)), ncol(cX), ncol(X)) |
|     out = out + (X_c %*% q) |
|   } |
| |
|   X = out |
| } |