blob: ff06dac817ecd8947cec5ede61b692d60199b7ac [file] [log] [blame]
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
# Related to [SYSTEMDS-2662] dependency function for cleaning pipelines
# Imputes missing values (NaN): numeric features by their column median, categorical features by their column mode.
# INPUT PARAMETERS:
# ---------------------------------------------------------------------------------------------
# NAME TYPE DEFAULT MEANING
# ---------------------------------------------------------------------------------------------
# X Double --- Data Matrix (Recoded Matrix for categorical features)
# mask Double --- A 0/1 row vector for identifying numeric (0) and categorical features (1)
# ---------------------------------------------------------------------------------------------
#Output(s)
# ---------------------------------------------------------------------------------------------
# NAME TYPE DEFAULT MEANING
# ---------------------------------------------------------------------------------------------
# X Double --- imputed dataset
# Impute missing values (NaN) in X: numeric columns (mask == 0) receive the
# median of their observed values, categorical columns (mask == 1) are
# delegated to imputeByMode.
#
# X     data matrix; categorical features are expected in recoded form
# mask  0/1 row vector with one entry per column of X (0 = numeric, 1 = categorical)
#
# Returns X with the same column order and the missing cells filled in.
m_imputeByMedian = function(Matrix[Double] X, Matrix[Double] mask)
return(Matrix[Double] X)
{
  # split X column-wise into its numeric and categorical parts
  numRaw = removeEmpty(target=X, margin="cols", select=(mask==0))
  cX = removeEmpty(target=X, margin="cols", select=mask)

  # remember where the numeric part is missing, then zero those cells so the
  # medians can be added in afterwards
  Mask_n = is.na(numRaw);
  nX = replace(target=numRaw, pattern=NaN, replacement=0);
  cols = ncol(nX)

  # median imputation
  # The median is taken from the i-th NUMERIC column (not the i-th column of X,
  # which may be a categorical feature) and only over its observed rows, so
  # missing cells do not distort the statistic.
  colMedian = matrix(0, 1, cols)
  parfor(i in 1:cols, check=0) {
    observed = removeEmpty(target=numRaw[, i], margin="rows", select=(Mask_n[, i] == 0))
    colMedian[1, i] = median(observed)
  }
  X_n = nX + (Mask_n * colMedian)

  # mode imputation
  X_c = imputeByMode(cX)

  # p and q are 0/1 selection matrices that map the compacted numeric and
  # categorical columns back to their original positions in X
  # NOTE(review): behavior when mask is all 0 or all 1 (one part is empty) is
  # unchanged from the original and has not been verified here.
  p = table(seq(1, ncol(nX)), removeEmpty(target=seq(1, ncol(mask)), margin="rows",
    select=t(mask==0)), ncol(nX), ncol(X))
  q = table(seq(1, ncol(cX)), removeEmpty(target=seq(1, ncol(mask)), margin="rows",
    select=t(mask)), ncol(cX), ncol(X))
  X = (X_n %*% p) + (X_c %*% q)
}