blob: d4dc5ef41b9e56ac5d534ad897e7618713326dc3 [file] [log] [blame]
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
# Builtin function for detecting and repairing outliers using standard deviation
#
# Per column, every value outside [mean - k * sd, mean + k * sd] is flagged as an outlier and
# repaired according to repairMethod. Mean and standard deviation are recomputed on the repaired
# data and the procedure repeats until no value is flagged, a repair pass makes no progress, or
# max_iterations passes have been executed.
#
# INPUT PARAMETERS:
# ---------------------------------------------------------------------------------------------
# NAME            TYPE     DEFAULT  MEANING
# ---------------------------------------------------------------------------------------------
# X               Double   ---      Matrix X
# k               Double   3        threshold values 1, 2, 3 for 68%, 95%, 99.7% respectively
#                                   (3-sigma rule); must be in the range 1-7
# repairMethod    Integer  1        values: 0 = delete rows having outliers,
#                                   1 = replace outliers as zeros,
#                                   2 = replace outliers as missing values (NaN)
# max_iterations  Integer  0        values: 0 = arbitrary number of iterations until all
#                                   outliers are removed, n = any constant defined by user
# verbose         Boolean  TRUE     print the last outlier filter, the number of executed
#                                   iterations and the resulting matrix
# ---------------------------------------------------------------------------------------------
# OUTPUT:
# ---------------------------------------------------------------------------------------------
# NAME            TYPE     DEFAULT  MEANING
# ---------------------------------------------------------------------------------------------
# Y               Double   ---      Matrix X with no outliers
# ---------------------------------------------------------------------------------------------
m_outlierBySd = function(Matrix[Double] X, Double k = 3, Integer repairMethod = 1,
  Integer max_iterations = 0, Boolean verbose = TRUE) return(Matrix[Double] Y)
{
  if( k < 1 | k > 7)
    stop("outlierBySd: invalid argument - k should be in range 1-7 found "+k)

  # NaN-free sums of X before/after the most recent repair pass; the initial values
  # (0 and 1) are chosen so that the guard below lets the first pass through.
  sumPrevious = 0
  sumNext = 1
  counter = 0

  # Defined up front so the verbose output below has a filter to print even when the
  # loop body never runs (e.g., a negative max_iterations).
  outlierFilter = matrix(0, rows=nrow(X), cols=ncol(X))

  while( max_iterations == 0 | counter < max_iterations )
  {
    colSD = colSds(X)
    colMean = (colMeans(X))
    upperBound = colMean + k * colSD
    lowerBound = colMean - k * colSD
    # 1 marks a cell outside the per-column bounds, 0 a regular cell
    outlierFilter = (X < lowerBound) | (X > upperBound)

    # Repair as long as at least one cell is flagged. The sum comparison stops the loop
    # when the previous pass left the NaN-free sum of X unchanged or when that sum is zero.
    # NOTE(review): presumably this is a no-progress guard for max_iterations = 0, where
    # values written by the repair (e.g., zeros) could be flagged again forever - confirm.
    if(sum(outlierFilter) > 0 & sumNext != 0 & sumPrevious != sumNext) {
      temp = replace(target=X, pattern = NaN, replacement = 0)
      sumPrevious = sum(temp)
      X = fix_outliers_sd(X, outlierFilter, repairMethod)
      temp = replace(target=X, pattern = NaN, replacement = 0)
      sumNext = sum(temp)
    }
    else
      max_iterations = - 1;  # makes the loop condition false, i.e., terminates the loop

    # also counts the final pass that only detected termination
    counter = counter + 1;
  }

  Y = X

  if(verbose) {
    print("last outlier filter:\n"+ toString(outlierFilter))
    print("Total executed iterations = "+counter)
    print("Upper-bound of data was calculated using Mean + k * Standard Deviation")
    print("lower-bound of data was calculated using Mean - k * Standard Deviation")
    print("Anything less than the lower-bound and greater than the upper-bound was treated as outlier")
    if(sum(Y) == 0)
      print("output is a zero matrix due to iterative evaluation of outliers ")
    print("output:\n"+ toString(Y))
  }
}
# Repairs the cells of X that are flagged in outlierFilter.
#
# X              input matrix
# outlierFilter  0/1 matrix of the same shape as X, 1 marks an outlier cell
# repairMethod   0 = remove every row that contains at least one flagged cell,
#                1 = replace flagged cells with zero,
#                2 = replace flagged cells with NaN
# fixed_X        repaired matrix; has fewer rows than X for repairMethod 0 when rows are removed
#
# Stops with an error for any other repairMethod.
fix_outliers_sd = function(Matrix[Double] X, Matrix[Double] outlierFilter, Integer repairMethod = 2)
  return(Matrix[Double] fixed_X)
{
  if(repairMethod == 0) {
    # keep only rows whose filter row is all zeros, i.e., rows without any outlier
    sel = (rowMaxs(outlierFilter) == 0)
    X = removeEmpty(target = X, margin = "rows", select = sel)
  }
  else if(repairMethod == 1)
    # (outlierFilter == 0) is 1 for regular cells and 0 for outliers
    X = (outlierFilter == 0) * X
  else if (repairMethod == 2) {
    # turn the 0 entries of the keep-mask (the outliers) into NaN so the product yields NaN there
    outlierFilter = replace(target = (outlierFilter == 0), pattern = 0, replacement = NaN)
    X = outlierFilter * X
  }
  else
    stop("outlierBySd: invalid argument - repair required 0-2 found: "+repairMethod)

  fixed_X = X
}