blob: f938fa3cf6f56984f17fcdbd8902bfb2fc7299dc [file] [log] [blame]
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
# Builtin function for detecting and repairing outliers using the interquartile range (IQR)
#
# INPUT PARAMETERS:
# ---------------------------------------------------------------------------------------------
# NAME TYPE DEFAULT MEANING
# ---------------------------------------------------------------------------------------------
# X Double --- Matrix X
# k Double 1.5 a constant used to discern outliers k*IQR
# repairMethod Integer 1 values: 0 = delete rows having outliers,
# 1 = replace outliers with zeros
# 2 = replace outliers as missing values (NaN)
# max_iterations Integer --- values: 0 = arbitrary number of iterations until all outliers are removed,
# n = any constant defined by user
# verbose Boolean TRUE print the iteration count, the bound formulas and the output matrix
# ---------------------------------------------------------------------------------------------
#Output(s)
# ---------------------------------------------------------------------------------------------
# NAME TYPE DEFAULT MEANING
# ---------------------------------------------------------------------------------------------
# Y Double --- Matrix X with no outliers
# Flags every cell of X that lies outside [Q1 - k*IQR, Q3 + k*IQR] of its column
# and repairs it according to repairMethod, repeating until a stop condition holds.
#
# X               input matrix
# k               multiplier applied to the IQR when computing the bounds
# repairMethod    0 = drop rows containing outliers, 1 = set outliers to zero,
#                 2 = set outliers to NaN
# max_iterations  0 = loop until a stop condition holds, n > 0 = at most n passes
# verbose         when TRUE, prints the iteration count, the bound formulas and Y
#
# Returns Y, the repaired matrix.
m_outlierByIQR = function(Matrix[Double] X, Double k =1.5, Integer repairMethod = 1,
  Integer max_iterations, Boolean verbose = TRUE) return(Matrix[Double] Y)
{
  # Both sums are reassigned with double results of sum() inside the loop, so they
  # are initialised as doubles to keep the scalar value type stable across passes.
  sumPrevious = as.double(0)
  sumNext = as.double(1)
  counter = 0

  while( max_iterations == 0 | counter < max_iterations )
  {
    # Q1, Q3 and IQR are 1 x ncol(X) row vectors; the bounds are compared against
    # every row of X column by column.
    [Q1, Q3, IQR] = compute_quartiles(X)
    upperBound = (Q3 + (k * IQR));
    lowerBound = (Q1 - (k * IQR));
    outlierFilter = X < lowerBound | X > upperBound

    # Repair only while more than one cell is flagged, the NaN-as-zero sum of X is
    # non-zero, and the previous repair actually changed that sum.
    # NOTE(review): "> 1" leaves a single flagged cell unrepaired - confirm intent.
    if(sum(outlierFilter) > 1 & sumNext != 0 & sumPrevious != sumNext ) {
      #TODO: see outlierBySd why are sumPrevious and sumNext necessary
      temp = replace(target=X, pattern = NaN, replacement = 0)
      sumPrevious = sum(temp)
      X = fix_outliers_iqr(X, outlierFilter, repairMethod)
      temp = replace(target=X, pattern = NaN, replacement = 0)
      sumNext = sum(temp)
    }
    else
      # Acts as a break: with -1 neither "== 0" nor "counter < max_iterations" holds.
      max_iterations = -1

    # Incremented on every pass, including the final pass that only detects the stop.
    counter = counter + 1;
  }
  Y = X

  if(verbose) {
    print("Total executed iterations = "+counter)
    print("Upper-bound of data was calculated using Q3 + k * IQR")
    print("lower-bound of data was calculated using Q1 - k * IQR")
    print("Anything less than the lower-bound and greater than the upper-bound was treated as outlier")
    if(sum(Y) == 0)
      print("output is a zero matrix due to iterative evaluation of outliers ")
    print("output:\n"+ toString(Y))
  }
}
# Applies one repair pass to X for the cells marked in outlierFilter.
#
# X              input matrix
# outlierFilter  matrix of the same shape as X; non-zero entries mark outlier cells
# repairMethod   0 = remove every row that contains at least one marked cell,
#                1 = multiply marked cells by 0,
#                2 = replace marked cells with NaN
#
# Returns fixed_X, the repaired matrix. Stops with an error for any other repairMethod.
fix_outliers_iqr = function(Matrix[Double] X, Matrix[Double] outlierFilter, Integer repairMethod = 1)
  return(Matrix[Double] fixed_X)
{
  # Reject unsupported methods up front so the branches below only handle valid ones.
  if(repairMethod < 0 | repairMethod > 2)
    stop("outlierByIQR: invalid argument - repair required 0-2 found: "+repairMethod)

  if(repairMethod == 0) {
    # A row is kept only when none of its cells is marked.
    sel = rowMaxs(outlierFilter) == 0
    X = removeEmpty(target = X, margin = "rows", select = sel)
  }
  else if(repairMethod == 1)
    # 1 for regular cells, 0 for marked cells.
    X = (outlierFilter == 0) * X
  else
  {
    # Turn the keep-mask (1 = keep, 0 = outlier) into 1 / NaN so that the
    # multiplication propagates NaN into the marked cells.
    outlierFilter = replace(target = (outlierFilter == 0), pattern = 0, replacement = NaN)
    X = outlierFilter * X
  }

  fixed_X = X
}
# Computes, for every column of X, the 0.25 and 0.75 quantiles and their difference.
#
# X  input matrix
#
# Returns three 1 x ncol(X) row vectors: colQ1 (0.25 quantile), colQ3 (0.75 quantile)
# and IQR = colQ3 - colQ1.
compute_quartiles = function(Matrix[Double] X)
  return(Matrix[Double] colQ1, Matrix[Double] colQ3, Matrix[Double] IQR)
{
  cols = ncol(X)
  colQ1 = matrix(0, 1, cols)
  colQ3 = matrix(0, 1, cols)

  if(nrow(X) > 1) {
    # Columns are independent, so each one is processed in its own parfor iteration.
    parfor(i in 1:cols) {
      colQ1[,i] = quantile(X[,i], 0.25)
      colQ3[,i] = quantile(X[,i], 0.75)
    }
  }
  else if(nrow(X) == 1) {
    # With a single observation both quartiles are that observation. Leaving the
    # zero initialisation would collapse the caller's bounds to 0 and mark every
    # non-zero cell of a one-row matrix as an outlier.
    colQ1 = X
    colQ3 = X
  }

  IQR = colQ3 - colQ1
}