blob: 84549f199d99a5894aead1d719dc129eac5b565c [file] [log] [blame]
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
# Generate the logical pipelines for data cleaning
source("scripts/pipelines/scripts/utils.dml") as utils;
# read the inputs
F = read($dirtyData, data_type="frame", format="csv", header=TRUE,
naStrings= ["NA", "null"," ","NaN", "nan", "", " ", "_nan_", "inf", "?", "NAN", "99999", "99999.00"]);
metaInfo = read($metaData, data_type="frame", format="csv", header=FALSE);
primitives = read($primitives, data_type = "frame", format="csv", header= TRUE)
param = read($parameters, data_type = "frame", format="csv", header= TRUE)
topK = $topk
resources = $rv
expectedIncrease=$expectedIncrease
sample=$sample
max_iter=$max_iter
output=$output
testCV = as.logical($testCV)
cvk = as.integer($cvk)
trainTestSplit = as.double($split)
evalFunc = "evalClassification"
split = nrow(F) * trainTestSplit
if(testCV) {
trainData = F
testData = frame("", rows=0, cols=0)
}
else {
trainData = F[1:split,]
testData = F[split+1:nrow(F),]
}
if(nrow(metaInfo) < 2)
stop("incomplete meta info")
metaInfo = metaInfo[, 2:ncol(metaInfo)]
# # # split in train/test 70/30
[topKPipelines, topKHyperParams, topKScores, baseLineScore, evalFunHp, applyFunc] = topk_cleaning(dataTrain=trainData, dataTest=testData, metaData=metaInfo, primitives=primitives, parameters=param,
refSol = frame(["imputeByMean", "scale", "dummycoding"], rows=1, cols=3),
evaluationFunc=evalFunc, evalFunHp=as.matrix(NaN),topK=topK, resource_val=resources, enablePruning=TRUE,
expectedIncrease=expectedIncrease, seed = 23, max_iter=max_iter, cv=testCV, cvk=cvk, sample=sample, isLastLabel=TRUE, correctTypos=FALSE)
write(topKPipelines, output+"/pip.csv", format="csv")
write(topKHyperParams, output+"/hp.csv", format="csv")
write(topKScores, output+"/bestAcc.csv", format="csv")
write(baseLineScore, output+"/dirtyScore.csv", format="csv")
write(evalFunHp, output+"/evalHp.csv", format="csv")
write(applyFunc, output+"/applyFunc.csv", format="csv")
result = baseLineScore < as.scalar(topKScores[1, 1])
write(result, $O)
# UDF for evaluation
# choice of parameters provided by API, X, Y, clone_X, evalFunHp (hyper-param), trainML (boolean for optimizing hp internally or passed by externally )
evalClassification = function(Matrix[Double] X, Matrix[Double] Y, Matrix[Double] Xtest, Matrix[Double] Ytest, Matrix[Double] Xorig=as.matrix(0),
Matrix[Double] evalFunHp)
return(Matrix[Double] output, Matrix[Double] error)
{
if(is.na(as.scalar(evalFunHp[1,1])))
{
nc = max(Y);
params = list("icpt", "reg", "tol")
paramRanges = list(seq(0, 2, 1), 10^seq(1,-3), 10^seq(1,-5));
trainArgs = list(X=X, Y=Y, icpt=-1, reg=-1, tol=1e-9, maxi=1000, maxii=-1, verbose=FALSE);
dataArgs = list("X", "Y");
# [B1,opt] = gridSearch(X=X, y=Y, train="multiLogReg", predict="accuracy", numB=(ncol(X)+1)*(nc-1),
# params=params, paramValues=paramRanges, dataArgs=dataArgs, trainArgs=trainArgs, cv=TRUE, cvk=3, verbose=TRUE);
# evalFunHp = as.matrix(opt) # opt #
opt = matrix("2 10 0.001", rows=1, cols=3)
evalFunHp = opt
}
if(min(Y) == max(Y))
{
accuracy = as.matrix(0)
a = 0
}
else {
beta = multiLogReg(X=X, Y=Y, icpt=as.scalar(evalFunHp[1,1]), reg=as.scalar(evalFunHp[1,2]), tol=as.scalar(evalFunHp[1,3]),
maxi=1000, maxii=0, verbose=FALSE);
[prob, yhat, accuracy] = multiLogRegPredict(Xtest, beta, Ytest, FALSE)
error = yhat != Ytest
accuracy = as.matrix(accuracy)
}
output = cbind(accuracy, evalFunHp)
}
accuracy = function(Matrix[Double] X, Matrix[Double] y, Matrix[Double] B) return (Matrix[Double] err) {
[M,yhat,acc] = multiLogRegPredict(X=X, B=B, Y=y, verbose=FALSE);
err = as.matrix(1-(acc/100));
}
evalClassificationMSVM = function(Matrix[Double] X, Matrix[Double] Y, Matrix[Double] Xtest, Matrix[Double] Ytest, Matrix[Double] Xorig=as.matrix(0),
Matrix[Double] evalFunHp)
return(Matrix[Double] output, Matrix[Double] error)
{
if(is.na(as.scalar(evalFunHp[1,1])))
{
nc = max(Y);
params = list("intercept", "reg", "epsilon")
paramRanges = list(seq(0, 1), 10^seq(1,-3), 10^seq(1,-5));
trainArgs = list(X=X, Y=Y, intercept=-1, reg=-1, epsilon=-1, maxIterations=1000, verbose=FALSE);
dataArgs = list("X", "Y");
[B1,opt] = gridSearch(X=X, y=Y, train="msvm", predict="accuracyMSVM", numB=(ncol(X)+1)*(nc),
params=params, paramValues=paramRanges, dataArgs=dataArgs, trainArgs=trainArgs, cv=TRUE, cvk=3, verbose=FALSE);
evalFunHp = as.matrix(opt) # opt #
# opt = matrix("2 10 0.001", rows=1, cols=3)
# evalFunHp = opt
}
if(min(Y) == max(Y))
{
accuracy = as.matrix(0)
a = 0
}
else {
beta = msvm(X=X, Y=Y, intercept=as.scalar(evalFunHp[1,1]), reg=as.scalar(evalFunHp[1,2]), epsilon=as.scalar(evalFunHp[1,3]),
maxIterations=1000, verbose=FALSE);
yhat = msvmPredict(X=Xtest, W=beta);
yhat = rowIndexMax(yhat)
accuracy = mean(yhat == Ytest) * 100
error = yhat != Ytest
accuracy = as.matrix(accuracy)
}
output = cbind(accuracy, evalFunHp)
}
accuracyMSVM = function(Matrix[Double] X, Matrix[Double] y, Matrix[Double] B) return (Matrix[Double] err) {
yhat = msvmPredict(X=X, W=B);
yhat = rowIndexMax(yhat)
acc = mean(yhat == y)
err = as.matrix(1-(acc));
}