blob: 34ae24bbe255555197e2a7676fd46f1e6c13fc4d [file] [log] [blame]
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
##################################################################################################################
# This script reads the dirty and clean data, applies the best pipeline to the dirty data,
# then classifies both the cleaned and the original dataset and checks whether the cleaned dataset
# performs the same as the original dataset in terms of classification accuracy
# Vocab = original data -> dataset without any noise, the original version with ground truths
# cleaned data -> dirty dataset cleaned by pipeline
# read the items
# dirty dataset F
# clean dataset O
# metadata (schema and mask)
# best k pipelines and hyperparameters generated by previous script mainScript.dml
# do the initial preprocessing like dropping invalid values so that pipeline could fix them
# then recode the data to bring it into matrix format
# then construct the hyper-parameters list and call the executePipeline() on the dirty dataset
# for the comparison OHE the original dataset, there is no need to OHE the cleaned dataset because cleaning pipeline
# has a primitive for this
# Call multiLogReg on both datasets and compare accuracy using k=3 cross validation
######################################################################################################################
source("scripts/pipelines/scripts/utils.dml") as utils;
# $1: dirty dataset (CSV frame); the listed tokens are treated as missing values
F = read($1, data_type="frame", format="csv", header=TRUE,
naStrings= ["NA", "null"," ","NaN", "nan", "", "?", "99999"]);
# $2: metadata frame (schema and mask rows); first column is a label column and is dropped below
metaInfo = read($2, data_type="frame", format="csv", header=FALSE);
# $3: directory containing the artifacts produced by the previous top-k pipeline search
input = $3
pip = read(input+"pip.csv", data_type="frame", format="csv", header=FALSE);
applyFunc = read(input+"applyFunc.csv", data_type="frame", format="csv", header=FALSE);
hp = read(input+"hp.csv", data_type="matrix", format="csv", header=FALSE);
evalHp = read(input+"evalHp.csv", data_type="matrix", format="csv", header=FALSE);
# dirtyScore = read(input+"dirtyScore.csv", data_type="scalar", value_type="double");
# $5: fraction of rows used for training (NOTE(review): $4 is not read anywhere in this script — confirm intended)
trainTestSplit = as.double($5)
# drop the first metadata column (row labels), keep only per-attribute schema/mask info
metaInfo = metaInfo[, 2:ncol(metaInfo)]
# NOTE(review): split may be fractional when nrow(F)*trainTestSplit is not an integer — confirm indexing tolerates this
split = nrow(F) * trainTestSplit
trainData = F[1:split,]
# rows after the split point form the test set
# NOTE(review): relies on DML parsing "split+1:nrow(F)" as the range (split+1):nrow(F) — confirm precedence
testData = F[split+1:nrow(F),]
print("pipeline: "+toString(pip[1]))
# fit the best (first) pipeline on the train/test split; returns scores plus the
# transformed matrices and the encoder/imputation state needed to re-apply the pipeline
[result, trX, tsX, exState, iState] = fit_pipeline(trainData, testData, metaInfo, pip[1,], applyFunc[1,], hp[1,], 3, "evalClassification", evalHp, TRUE, FALSE)
# re-apply the same fitted pipeline to the raw test data; should reproduce tsX
eXtest = apply_pipeline(testData, metaInfo, pip[1,], applyFunc[1,], hp[1,], TRUE, exState, iState, FALSE)
result = as.frame(result)
# success criterion 1: test accuracy (col 3) beats the dirty-data accuracy (col 1)
resultBool = as.scalar(result[1, 3] > result[1, 1])
eXtest = replace(target=eXtest, pattern=NaN, replacement=0)
tsX = replace(target=tsX, pattern=NaN, replacement=0)
# spot-check rows 51..111 for an exact match between re-applied and fitted test matrices
# NOTE(review): magic row range 51:111 — presumably an arbitrary sample window; confirm
resApply = sum(eXtest[51:111] - tsX[51:111, 1:ncol(eXtest)]) == 0
# fraction of cells where the re-applied result exceeds the fitted one
# NOTE(review): counts only positive differences (eXtest > tsX), not absolute deviation — confirm intended
percent = sum(eXtest-tsX[, 1:ncol(eXtest)] > 0) / (nrow(eXtest) * ncol(eXtest))
# success criterion 2: fewer than 5% of cells deviate
errorMargin = percent < 0.05
resultBool = resultBool & errorMargin
# $6: output path for the overall pass/fail flag
write(resultBool, $6)
header = frame(["dirty acc", "train acc", "test acc"], rows=1, cols=3)
writeRes = rbind(header, result)
print(toString(writeRes))
# UDF used by fit_pipeline to score a candidate pipeline via classification accuracy.
# Inputs: train matrices X/Y, test matrices Xtest/Ytest, optional original data Xorig,
# and evalFunHp (hyper-parameters for multiLogReg: icpt, reg, tol).
# If evalFunHp[1,1] is NA, the hyper-parameters are first tuned via gridSearch with
# 3-fold cross validation; otherwise the supplied values are used directly.
# Returns: output = cbind(accuracy, evalFunHp); error = per-row misclassification indicator.
evalClassification = function(Matrix[Double] X, Matrix[Double] Y, Matrix[Double] Xtest, Matrix[Double] Ytest, Matrix[Double] Xorig=as.matrix(0),
  Matrix[Double] evalFunHp)
return(Matrix[Double] output, Matrix[Double] error)
{
  if(is.na(as.scalar(evalFunHp[1,1])))
  {
    # no hyper-parameters supplied: tune icpt/reg/tol/maxi by cross-validated grid search
    nc = max(Y);
    params = list("icpt", "reg", "tol", "maxi")
    paramRanges = list(seq(0, 2, 1), 10^seq(1,-3), 10^seq(1,-5), 10^seq(1,3));
    trainArgs = list(X=X, y=Y, icpt=-1, reg=-1, tol=1e-9, maxi=100, maxii=-1, verbose=FALSE);
    # BUGFIX: the fold count must be passed as cvk; the original passed cv twice (cv=TRUE, cv=3)
    [B1,opt] = gridSearch(X=X, y=Y, train="multiLogReg", predict="accuracy", numB=(ncol(X)+1)*(nc-1),
      params=params, paramValues=paramRanges, trainArgs=trainArgs, cv=TRUE, cvk=3, verbose=FALSE);
    evalFunHp = as.matrix(opt)
  }
  if(min(Y) == max(Y))
  {
    # degenerate case: a single class in Y — training is meaningless, report zero accuracy
    accuracy = as.matrix(0)
    # BUGFIX: error must be assigned on every path (it is a declared function output)
    error = matrix(0, rows=nrow(Ytest), cols=1)
  }
  else {
    # train with the (tuned or supplied) hyper-parameters, then score on the test split
    beta = multiLogReg(X=X, Y=Y, icpt=as.scalar(evalFunHp[1,1]), reg=as.scalar(evalFunHp[1,2]), tol=as.scalar(evalFunHp[1,3]),
      maxi=1000, maxii=0, verbose=FALSE);
    [prob, yhat, accuracy] = multiLogRegPredict(Xtest, beta, Ytest, FALSE)
    # 1 where the prediction disagrees with the ground truth, 0 otherwise
    error = yhat != Ytest
    accuracy = as.matrix(accuracy)
    print("accuracy: "+toString(accuracy))
  }
  output = cbind(accuracy, evalFunHp)
}
# Predict hook for gridSearch: returns the misclassification rate (1 - accuracy)
# of model B evaluated on features X with labels y, as a 1x1 matrix.
accuracy = function(Matrix[Double] X, Matrix[Double] y, Matrix[Double] B) return (Matrix[Double] err) {
  [probs, predictions, accPct] = multiLogRegPredict(X=X, B=B, Y=y, verbose=FALSE);
  # multiLogRegPredict reports accuracy as a percentage; convert to an error fraction
  errRate = 1 - (accPct / 100);
  err = as.matrix(errRate);
}