blob: ac486a509e30833642a015be315833a2e10eac9d [file] [log] [blame]
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
# Iteratively repairs cells of X that fall outside the per-column fences
# [Q1 - k*IQR, Q3 + k*IQR], recomputing the quartiles after every repair.
#
# X               input matrix
# k               fence multiplier applied to the IQR
# repairMethod    forwarded to fix_outliers_iqr (0, 1 or 2)
# max_iterations  upper bound on loop iterations; 0 means "no upper bound"
# verbose         accepted but not read anywhere in this function
#
# Returns the repaired matrix Y, the quartiles/IQR of the last iteration,
# and k / repairMethod unchanged.
rwOutlierByIQR = function(Matrix[Double] X, Double k =1.5, Integer repairMethod = 1,
  Integer max_iterations, Boolean verbose = TRUE)
  return(Matrix[Double] Y, Matrix[Double] Q1, Matrix[Double] Q3, Matrix[Double] IQR, Double k, Integer repairMethod)
{
  # NaN-free sums of the data before and after the most recent repair
  checksumBefore = as.double(0)
  checksumAfter = as.double(1)
  iter = 0
  keepGoing = TRUE
  while( keepGoing & (max_iterations == 0 | iter < max_iterations) )
  {
    [Q1, Q3, IQR] = compute_quartiles(X)
    aboveFence = X > (Q3 + (k * IQR))
    belowFence = X < (Q1 - (k * IQR))
    outlierFilter = belowFence | aboveFence
    #TODO: see outlierBySd why are sumPrevious and sumNext necessary
    needsRepair = sum(outlierFilter) > 1 & checksumAfter != 0 & checksumBefore != checksumAfter
    if(needsRepair) {
      checksumBefore = sum(replace(target=X, pattern = NaN, replacement = 0))
      X = fix_outliers_iqr(X, outlierFilter, repairMethod)
      checksumAfter = sum(replace(target=X, pattern = NaN, replacement = 0))
    }
    else {
      # nothing (more) to repair: leave the loop after this iteration
      keepGoing = FALSE
    }
    iter = iter + 1;
  }
  Y = X
}
# Repairs the cells of X that are flagged (non-zero) in outlierFilter.
#
# X              input matrix
# outlierFilter  matrix of the same shape as X; non-zero marks an outlier cell
# repairMethod   0: drop every row that contains at least one flagged cell
#                1: overwrite flagged cells with 0
#                2: overwrite flagged cells with NaN
#
# Returns the repaired matrix; stops with an error for any other repairMethod.
fix_outliers_iqr = function(Matrix[Double] X, Matrix[Double] outlierFilter, Integer repairMethod=1)
  return(Matrix[Double] fixed_X)
{
  if(repairMethod == 0) {
    # a row survives only if none of its cells is flagged
    sel = rowMaxs(outlierFilter) == 0
    X = removeEmpty(target = X, margin = "rows", select = sel)
  }
  else if(repairMethod == 1) {
    # multiply by the inverted filter: flagged cells become 0
    X = (outlierFilter == 0) * X
  }
  else if(repairMethod == 2)
  {
    # inverted filter with its zeros (the flagged cells) turned into NaN,
    # so the product is NaN exactly at the flagged positions
    outlierFilter = replace(target = (outlierFilter == 0), pattern = 0, replacement = NaN)
    X = outlierFilter * X
  }
  else
    stop("outlierByIQR: invalid argument - repair required 0-2 found: "+repairMethod)
  fixed_X = X
}
# Computes, per column of X, the 0.25 and 0.75 quantiles over the non-NaN
# entries, plus their difference.
#
# X  input matrix
#
# Returns three 1 x ncol(X) row vectors: colQ1, colQ3 and IQR = colQ3 - colQ1.
# When X has at most one row, all three are zero vectors.
compute_quartiles = function(Matrix[Double] X)
  return(Matrix[Double] colQ1, Matrix[Double] colQ3, Matrix[Double] IQR)
{
  numCols = ncol(X)
  colQ1 = matrix(0, 1, numCols)
  colQ3 = matrix(0, 1, numCols)
  if(nrow(X) > 1) {
    for(j in 1:numCols) {
      column = X[, j]
      # 1 where a value is present, 0 where it is NaN
      present = (is.na(column) == 0)
      values = removeEmpty(target=column, margin="rows", select=present)
      colQ1[,j] = quantile(values, 0.25)
      colQ3[,j] = quantile(values, 0.75)
    }
  }
  IQR = colQ3 - colQ1
}
# Fills NaN cells of X with a per-column value: the mean of the observed
# entries for columns whose mask entry is 0, and the value reported by
# imputeByMode for the masked columns.
#
# X     input matrix, may contain NaN
# mask  1 x ncol(X) row vector; 0 selects mean imputation for that column
#
# Returns the imputed matrix X and the row vector of per-column fill values.
rwImputeByMean = function(Matrix[Double] X, Matrix[Double] mask)
  return(Matrix[Double] X, Matrix[Double] imputedVec)
{
  # mean imputation
  numCols = ncol(X)
  meanVec = matrix(0, rows=1, cols=numCols)
  for(j in 1:numCols)
  {
    if(as.scalar(mask[1, j]) == 0)
    {
      column = X[, j]
      observed = removeEmpty(target=column, margin="rows", select = (is.na(column) == 0))
      meanVec[1, j] = mean(observed)
    }
  }
  # default: means only; extended below when any column is masked
  imputedVec = meanVec
  if(sum(mask) > 0)
  {
    # mode imputation on X with the unmasked columns zeroed out
    maskedX = X*mask
    [unusedX, modeVec] = imputeByMode(maskedX)
    imputedVec = meanVec + modeVec
  }
  X = imputeByMeanApply(X, imputedVec)
}
# Thin wrapper around the outlierByIQR builtin (at most 10 iterations,
# non-verbose) that exposes only the repaired matrix.
wrapIQR = function(Matrix[double] X) return (Matrix[double] out) {
  # alternative using the script-local re-implementation:
  #[X,q1,q3,iqr,k,r] = rwOutlierByIQR(X=X, max_iterations=0, verbose=FALSE);
  [repaired,q1,q3,iqr,k,r] = outlierByIQR(X=X, max_iterations=10, verbose=FALSE);
  # NOTE(review): empty loop is presumably there to keep this function from
  # being inlined by the compiler - confirm before removing
  while(FALSE){}
  out = repaired;
}
# Thin wrapper around the imputeByMode builtin that exposes only the imputed
# matrix and drops the vector of imputed values.
wrapImputeByMode = function(Matrix[double] X) return (Matrix[double] out) {
  [imputed, modeVec] = imputeByMode(X=X);
  # NOTE(review): empty loop is presumably there to keep this function from
  # being inlined by the compiler - confirm before removing
  while(FALSE){}
  out = imputed;
}
# Thin wrapper around the mice builtin that exposes only the imputed matrix.
#
# X          input matrix
# cMask      column mask forwarded to mice
# iter       number of mice iterations
# threshold  forwarded to mice (previously accepted here but silently ignored)
# verbose    forwarded to mice
#
# Returns the first output of mice; its remaining outputs are discarded.
wrapMice = function(Matrix[Double] X, Matrix[Double] cMask, Integer iter = 3,
  Double threshold = 0.8, Boolean verbose = FALSE) return(Matrix[Double] output) {
  [out,meta,th,dM,betaList] = mice(X=X, cMask=cMask, iter=iter, threshold=threshold, verbose=verbose);
  # NOTE(review): empty loop is presumably there to keep this function from
  # being inlined by the compiler - confirm before removing
  while(FALSE){}
  output = out;
}
# Trains an l2svm model on (X, y), predicts on the same X, and returns the
# percentage of rows whose predicted value equals the label in y.
getAccuracy = function(Matrix[double] X, Matrix[double] y) return (Double accuracy) {
  #R = crossV(X, y, 0.01, 0, 4);
  model = l2svm(X=X, Y=y);
  [rawScores, predicted] = l2svmPredict(X=X, W=model, verbose=FALSE);
  # a row counts as a hit when prediction minus label is exactly zero
  hits = sum((predicted - y) == 0);
  accuracy = hits / nrow(y) * 100;
}
# Driver: runs five cleaning pipelines on the input data, scores each with
# getAccuracy, and writes the sum of the five accuracies to $2.
#
# $1  path of the input matrix (column 7 is used as the label)
# $2  path of the scalar output

#dataX = rand(rows=8000, cols=23, min=-1, max=5, seed=42);
#datay = rand(rows=8000, cols=1, min=0, max=2, seed=42);
#datay = ceil(datay);
dataX = read($1, data_type="matrix");

# split off column 7 as the label, keep all other columns as features
datay = dataX[,7];
dataX = cbind(dataX[,1:6], dataX[,8:ncol(dataX)]);

# column mask passed to imputeByMean / mice; flags feature columns
# 1-2, 7, 9, 11, 13, 14-19 and 21-23
# NOTE(review): presumably these are the categorical columns - confirm
mask = matrix(0, rows=1, cols=ncol(dataX));
mask[1,1:2] = matrix(1, rows=1, cols=2);
mask[1,7] = 1;
mask[1,9] = 1;
mask[1,11] = 1;
mask[1,13] = 1;
mask[1,14:19] = matrix(1, rows=1, cols=6);
mask[1,21:23] = matrix(1, rows=1, cols=3);

# one accuracy slot per pipeline
accs = matrix(0, rows=1, cols=5);

# Pipeline1: imputeByMean, scale, outlierByIQR, underSampling
X = dataX;
y = datay;
[X, iVec] = imputeByMean(X=X, mask=mask);
X = scale(X=X, center=TRUE, scale=TRUE);
X = wrapIQR(X);
[X,y] = underSampling(X=X, Y=y, ratio=0.1);
acc = getAccuracy(X, y);
accs[1,1] = acc;

# Pipeline2: imputeByMean, scale, outlierByIQR
X = dataX;
y = datay;
[X, iVec]= imputeByMean(X=X, mask=mask);
X = scale(X=X, center=TRUE, scale=TRUE);
X = wrapIQR(X);
acc = getAccuracy(X, y);
accs[1,2] = acc;

# the Mice pipelines only use the first 8 feature columns
dataX = dataX[,1:8];
mask = mask[,1:8];

# Pipeline3: Mice
# (stored in its own slot; it previously overwrote the Pipeline1 result)
X = dataX;
y = datay;
X = wrapMice(X=X, cMask=mask, iter=1, verbose=FALSE);
acc = getAccuracy(X, y);
accs[1,3] = acc;

# Pipeline4: Mice, scale
# (stored in its own slot; it previously overwrote the Pipeline2 result)
X = dataX;
y = datay;
X = wrapMice(X=X, cMask=mask, iter=1, verbose=FALSE);
[X, cn, sf]= scale(X=X, center=TRUE, scale=TRUE);
acc = getAccuracy(X, y);
accs[1,4] = acc;

# Pipeline5: Mice, scale, outlierByIQR
X = dataX;
y = datay;
X = wrapMice(X=X, cMask=mask, iter=1, verbose=FALSE);
[X, cn, sf]= scale(X=X, center=TRUE, scale=TRUE);
X = wrapIQR(X);
acc = getAccuracy(X, y);
accs[1,5] = acc;

R = sum(accs);
write(R, $2, format="text");