blob: c53ed1cfef1674390a363b010b184a8f4cd8bb8c [file] [log] [blame]
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
# Builtin function for detecting and repairing outliers using standard deviation
#
# INPUT:
# ----------------------------------------------------------------------------------------
# X Matrix X
# k threshold values 1, 2, 3 for 68%, 95%, 99.7% respectively (3-sigma rule)
# repairMethod values: 0 = delete rows having outliers, 1 = replace outliers with zeros,
# 2 = replace outliers with missing values (NaN)
# max_iterations values: 0 = arbitrary number of iterations, repeated until the repair loop stops on its own,
# n = maximum number of iterations, any constant defined by the user
# ----------------------------------------------------------------------------------------
#
# OUTPUT:
# -------------------------------------------------------------------------------
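# out Matrix X with the detected outliers repaired
# colMean Column means computed in the last iteration
# colSD Column standard deviations computed in the last iteration
# k The threshold k, returned as passed in
# repairMethod The repair method, returned as passed in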
# -------------------------------------------------------------------------------
m_outlierBySd = function(Matrix[Double] X, Double k = 3, Integer repairMethod = 1,
  Integer max_iterations, Boolean verbose = TRUE)
  return(Matrix[Double] out, Matrix[Double] colMean, Matrix[Double] colSD, Double k, Integer repairMethod)
{
  # Iteratively flags every cell outside [colMean - k*colSD, colMean + k*colSD]
  # and repairs the flagged cells with fix_outliers_sd. The column statistics
  # are recomputed from the repaired matrix at the start of every pass, and the
  # statistics of the last pass are returned together with the repaired matrix.
  # k and repairMethod are returned unchanged.

  # variable initialization
  # sumPrevious / sumNext hold the NaN-free sum of X before / after the most
  # recent repair; they start out different so that the first pass can run.
  sumPrevious = as.double(0)
  sumNext = as.double(1)
  counter = 0
  outlierFilter = as.matrix(0)

  if( k < 1 | k > 10)
    stop("outlierBySd: invalid argument - k should be in range 1-10 found "+k)

  # max_iterations == 0 means "no fixed limit"; the loop then ends through the
  # else-branch below, which sets max_iterations to -1.
  while( max_iterations == 0 | counter < max_iterations )
  {
    [colMean, colSD] = getColMean_Sd(X)

    upperBound = colMean + k * colSD
    lowerBound = colMean - k * colSD

    # 1 where a cell lies outside the k-sigma band of its column, 0 elsewhere
    outlierFilter = (X < lowerBound) | (X > upperBound)

    # Repair while at least one outlier is left. The previous "> 1" test
    # skipped the repair when exactly one outlier remained.
    #
    # The two sum checks guard against passes that make no progress:
    # getColMean_Sd leaves zero and NaN cells out of the statistics, so zeros
    # written by an earlier repair can be flagged again on every pass without
    # X changing. sumPrevious == sumNext means the last repair left the
    # NaN-free sum of X unchanged, and sumNext == 0 means the repaired matrix
    # sums to zero; in both cases the loop stops.
    # NOTE(review): the sum is only a proxy for "X changed"; repairs that
    # cancel out in the sum also end the loop.
    if(sum(outlierFilter) > 0 & sumNext != 0 & sumPrevious != sumNext) {
      temp = replace(target=X, pattern = NaN, replacement = 0)
      sumPrevious = sum(temp)
      X = fix_outliers_sd(X, outlierFilter, repairMethod)
      temp = replace(target=X, pattern = NaN, replacement = 0)
      sumNext = sum(temp)
    }
    else
      max_iterations = - 1;  # makes the loop condition false on the next check

    counter = counter + 1;
  }
  out = X

  if(verbose) {
    print("Upper-bound of data was calculated using Mean + k * Standard Deviation")
    print("lower-bound of data was calculated using Mean - k * Standard Deviation")
    print("Anything less than the lower-bound and greater than the upper-bound was treated as outlier")
    if(sum(out) == 0)
      print("output is a zero matrix due to iterative evaluation of outliers ")
  }
}
fix_outliers_sd = function(Matrix[Double] X, Matrix[Double] outlierFilter, Integer repairMethod = 2)
  return(Matrix[Double] X)
{
  # Repairs the cells of X that are marked in outlierFilter (non-zero = outlier).
  #   repairMethod 0: drop every row that contains at least one marked cell
  #   repairMethod 1: overwrite marked cells with 0
  #   repairMethod 2: overwrite marked cells with NaN
  # Any other repairMethod stops with an error.
  if(repairMethod == 0)
    # keep only the rows whose filter row is all zero
    X = removeEmpty(target = X, margin = "rows", select = (rowMaxs(outlierFilter) == 0))
  else if(repairMethod == 1)
    # (outlierFilter == 0) is 1 for regular cells and 0 for outliers
    X = (outlierFilter == 0) * X
  else if (repairMethod == 2) {
    # turn the 0 entries of the keep-mask into NaN so the product yields NaN for outliers
    outlierFilter = replace(target = (outlierFilter == 0), pattern = 0, replacement = NaN)
    X = outlierFilter * X
  }
  else
    # the supported methods are 0, 1 and 2 (the message previously said 0-1)
    stop("outlierBySd: invalid argument - repair required 0-2 found: "+repairMethod)
}
getColMean_Sd = function(Matrix[Double] X)
  return(Matrix[Double] colMean, Matrix[Double] colSd)
{
  # Computes the mean and standard deviation of every column of X, returned as
  # two 1 x ncol(X) row vectors. NaN cells are first mapped to 0 and all zero
  # rows of the column are then removed, so neither NaN nor zero cells
  # contribute to the statistics.
  nFeatures = ncol(X)
  colMean = matrix(0, rows=1, cols=nFeatures)
  colSd = matrix(0, rows=1, cols=nFeatures)

  for(j in 1:nFeatures) {
    # NaN -> 0, then drop the zero rows of this single-column slice
    values = removeEmpty(target=replace(target=X[, j], pattern=NaN, replacement=0), margin="rows")
    colMean[1, j] = mean(values)
    colSd[1, j] = sd(values)
  }
}