| #------------------------------------------------------------- |
| # |
| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| # |
| #------------------------------------------------------------- |
| |
| |
| # Builtin function for detecting and repairing outliers using standard deviation |
| # |
| # INPUT PARAMETERS: |
| # --------------------------------------------------------------------------------------------- |
| # NAME TYPE DEFAULT MEANING |
| # --------------------------------------------------------------------------------------------- |
| # X Double --- Matrix X |
| # k Double 3 threshold in standard deviations: 1, 2, 3 cover 68%, 95%, 99.7% of the data respectively (3-sigma rule); accepted range is 1-7 |
| # repairMethod Integer 1 values: 0 = delete rows having outliers, 1 = replace outliers with zeros, |
| # 2 = replace outliers with missing values (NaN) |
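| # max_iterations Integer 0 values: 0 = arbitrary number of iterations until all outliers are removed, |
| # n = any constant defined by user |
| # verbose Boolean TRUE if TRUE, print the last outlier filter, the iteration count and the output matrix |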
| # --------------------------------------------------------------------------------------------- |
| |
| |
| #Output(s) |
| # --------------------------------------------------------------------------------------------- |
| # NAME TYPE DEFAULT MEANING |
| # --------------------------------------------------------------------------------------------- |
| # Y Double --- Matrix X with no outliers |
| |
| m_outlierBySd = function(Matrix[Double] X, Double k = 3, Integer repairMethod = 1, |
|   Integer max_iterations = 0, Boolean verbose = TRUE) return(Matrix[Double] Y) |
| { |
|   # Repeatedly flags every cell outside colMean +/- k * colSD as an outlier and |
|   # repairs it via fix_outliers_sd, until no cell is flagged, a repair pass no |
|   # longer changes the data, or max_iterations passes have run (0 = no fixed limit). |
|   if( k < 1 | k > 7) |
|     stop("outlierBySd: invalid argument - k should be in range 1-7 found "+k) |
| |
|   # doubles from the start, so the loop never changes the value type of these variables |
|   sumPrevious = 0.0 |
|   sumNext = 1.0 |
|   counter = 0 |
|   keepGoing = TRUE |
| |
|   # defined up front so the verbose report below also works when the loop body |
|   # never runs (e.g., for a negative max_iterations) |
|   outlierFilter = matrix(0, rows=nrow(X), cols=ncol(X)) |
| |
|   while( keepGoing & (max_iterations == 0 | counter < max_iterations) ) |
|   { |
|     colSD = colSds(X) |
|     colMean = colMeans(X) |
| |
|     upperBound = colMean + k * colSD |
|     lowerBound = colMean - k * colSD |
| |
|     outlierFilter = (X < lowerBound) | (X > upperBound) |
| |
|     # Repair as soon as at least one cell is flagged (the former "> 1" test left a |
|     # lone outlier untouched). The two sum guards end the loop once the previous |
|     # repair pass left the NaN-free total of X unchanged or drove it to zero: with |
|     # repairMethod = 1 a zero that is itself flagged gets replaced by zero again, |
|     # so the filter alone would never become empty. |
|     if(sum(outlierFilter) > 0 & sumNext != 0 & sumPrevious != sumNext) { |
|       temp = replace(target=X, pattern = NaN, replacement = 0) |
|       sumPrevious = sum(temp) |
|       X = fix_outliers_sd(X, outlierFilter, repairMethod) |
|       temp = replace(target=X, pattern = NaN, replacement = 0) |
|       sumNext = sum(temp) |
|     } |
|     else |
|       keepGoing = FALSE |
| |
|     # the terminating pass is counted as well |
|     counter = counter + 1 |
|   } |
| |
|   Y = X |
|   if(verbose) { |
|     print("last outlier filter:\n"+ toString(outlierFilter)) |
|     print("Total executed iterations = "+counter) |
|     print("Upper-bound of data was calculated using Mean + k * Standard Deviation") |
|     print("lower-bound of data was calculated using Mean - k * Standard Deviation") |
|     print("Anything less than the lower-bound and greater than the upper-bound was treated as outlier") |
|     if(sum(Y) == 0) |
|       print("output is a zero matrix due to iterative evaluation of outliers ") |
|     print("output:\n"+ toString(Y)) |
|   } |
| } |
| |
| # Repairs the cells of X that are flagged in outlierFilter, a 0/1 indicator matrix |
| # of the same shape as X (1 = outlier). |
| #   repairMethod 0: drop every row that contains at least one flagged cell |
| #   repairMethod 1: set flagged cells to 0 |
| #   repairMethod 2: set flagged cells to NaN |
| # Any other repairMethod stops with an error. Returns the repaired matrix. |
| fix_outliers_sd = function(Matrix[Double] X, Matrix[Double] outlierFilter, Integer repairMethod = 2) |
|   return(Matrix[Double] fixed_X) |
| { |
|   if(repairMethod == 0) { |
|     # a row is kept only if none of its cells is flagged |
|     sel = (rowMaxs(outlierFilter) == 0) |
|     X = removeEmpty(target = X, margin = "rows", select = sel) |
|   } |
|   else if(repairMethod == 1) |
|     # multiplying by the inverted filter zeroes the flagged cells |
|     X = (outlierFilter == 0) * X |
|   else if (repairMethod == 2) { |
|     # inverted filter with 0 -> NaN: flagged cells become NaN, all others are multiplied by 1 |
|     outlierFilter = replace(target = (outlierFilter == 0), pattern = 0, replacement = NaN) |
|     X = outlierFilter * X |
|   } |
|   else |
|     stop("outlierBySd: invalid argument - repair required 0-2 found: "+repairMethod) |
| |
|   fixed_X = X |
| } |