blob: 13aa9006ab96eae403a5c462b75bb5ec0fde105f [file] [log] [blame]
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
source("scripts/pipelines/scripts/utils.dml") as utils;
source("scripts/builtin/bandit.dml") as bandit;
# Runs a given pipeline on train/test data and reports three scores obtained
# from the caller-supplied evaluation function.
#
# INPUT:
#   trainData       training frame; the label is taken from the last column when isLastLabel is TRUE
#   testData        test frame, split the same way as trainData
#   metaData        frame read row-wise: row 1 = schema, row 2 = mask, row 3 = fd mask;
#                   its last column belongs to the label (only row 2 of that column is read)
#   lp              logical pipeline, forwarded unchanged to executePipeline
#   pip             physical pipeline, forwarded to executePipeline; ncol(pip) is the
#                   number of rows the hyper-parameter matrix is reshaped to
#   hp              row vector: hp[1,1] holds the number of values that follow in hp[1, 2:...]
#   evaluationFunc  name of the function invoked through eval()
#   evalFunHp       passed through to the evaluation function as evalFunHp
#   isLastLabel     forwarded to getLabel
#   correctTypos    forwarded to the string processing step
#
# OUTPUT:
#   result          1x3 matrix: [score on raw data, score on train, score on test]
s_applyAndEvaluate = function(Frame[Unknown] trainData, Frame[Unknown] testData, Frame[Unknown] metaData = as.frame("NULL"),
  Frame[Unknown] lp, Frame[Unknown] pip, Matrix[Double] hp, String evaluationFunc, Matrix[Double] evalFunHp,
  Boolean isLastLabel = TRUE, Boolean correctTypos=FALSE)
return (Matrix[Double] result)
{
  no_of_flag_vars = 5

  # rows 1..3 and at least one feature column plus the label column are indexed below;
  # fail with a readable message instead of an index-out-of-bounds error
  # (this also covers the 1x1 default value of metaData)
  if(nrow(metaData) < 3 | ncol(metaData) < 2)
    stop("applyAndEvaluate: metaData must contain schema, mask and fd rows and at least one feature column plus the label column")

  # feature-related meta data excludes the last (label) column
  schema = metaData[1, 1:ncol(metaData) - 1]
  mask = as.matrix(metaData[2, 1:ncol(metaData) - 1])
  fdMask = as.matrix(metaData[3, 1:ncol(metaData) - 1])
  maskY = as.integer(as.scalar(metaData[2, ncol(metaData)]))
  metaList = list(mask=mask, schema=schema, fd=fdMask)

  # separate the label
  [Xtrain, Ytrain] = getLabel(trainData, isLastLabel)
  [Xtest, Ytest] = getLabel(testData, isLastLabel)

  # recode the label only when its mask entry is 1; the test label reuses the train dictionary
  if(maskY == 1) {
    [eYtrain, M] = transformencode(target=Ytrain, spec= "{ids:true, recode:[1]}");
    eYtest = transformapply(target=Ytest, spec= "{ids:true, recode:[1]}", meta=M);
  }
  else
  {
    eYtrain = as.matrix(Ytrain)
    eYtest = as.matrix(Ytest)
  }

  # score of the evaluation function on the data before the pipeline is applied
  dirtyScore = getDirtyScore(X=Xtrain, Y=eYtrain, Xtest=Xtest, Ytest=eYtest, metaList=metaList, evaluationFunc=evaluationFunc, evalFunHp=evalFunHp)

  # string processing on train and test together (cv = FALSE)
  [Xtrain, Xtest] = runStringPipeline(Xtrain, Xtest, schema, mask, FALSE, correctTypos)

  # if mask has 1s then there are categorical features
  [eXtrain, eXtest] = recodeData(Xtrain, Xtest, mask, FALSE, "recode")

  # unpack the hyper-parameters: hp[1,1] is the count, the values follow it and are
  # reshaped to one row per pipeline column
  no_of_param = as.scalar(hp[1, 1]) + 1
  hp_width= hp[1, 2:no_of_param]
  hp_matrix = matrix(hp_width, rows=ncol(pip), cols=ncol(hp_width)/ncol(pip))

  # apply the pipeline; the last three outputs are not needed here
  [eXtrain, eYtrain, eXtest, eYtest, a, b,Tr] = executePipeline(logical=lp, pipeline=pip, X=eXtrain, Y=eYtrain, Xtest=eXtest, Ytest=eYtest, metaList=metaList,
    hyperParameters=hp_matrix, flagsCount=no_of_flag_vars, test=TRUE, verbose=FALSE)

  if(max(eYtrain) == min(eYtrain))
    stop("Y contains only one class")

  # train score: evaluate on the training data itself
  score = eval(evaluationFunc, list(X=eXtrain, Y=eYtrain, Xtest=eXtrain, Ytest=eYtrain, Xorig=as.matrix(0), evalFunHp=evalFunHp, trainML = FALSE))
  trainAccuracy = as.scalar(score[1, 1])

  # test score: evaluate on the held-out test data
  score = eval(evaluationFunc, list(X=eXtrain, Y=eYtrain, Xtest=eXtest, Ytest=eYtest, Xorig=as.matrix(0), evalFunHp=evalFunHp, trainML = FALSE))
  testAccuracy = as.scalar(score[1, 1])

  result = matrix(0, rows=1, cols=3)
  result[1, 1] = dirtyScore
  result[1, 2] = trainAccuracy
  result[1, 3] = testAccuracy
}
# Runs utils::stringProcessing on the feature frames.
# With cv = TRUE only Xtrain is processed and Xtest is returned as received.
# With cv = FALSE train and test are stacked, processed in a single call so both
# share the same dictionary, and then split again at the original train row count.
runStringPipeline = function(Frame[Unknown] Xtrain, Frame[Unknown] Xtest, Frame[String] schema,
  Matrix[Double] mask, Boolean cv, Boolean correctTypos = FALSE)
return(Frame[Unknown] Xtrain, Frame[Unknown] Xtest)
{
  if(!cv)
  {
    # remember where the train rows end before Xtrain is overwritten
    trainRows = nrow(Xtrain)
    stacked = rbind(Xtrain, Xtest)
    processed = utils::stringProcessing(data=stacked, mask=mask, schema=schema, CorrectTypos=correctTypos)
    Xtrain = processed[1:trainRows,]
    Xtest = processed[(trainRows + 1):nrow(processed),]
  }
  else
  {
    Xtrain = utils::stringProcessing(data=Xtrain, mask=mask, schema=schema, CorrectTypos=correctTypos)
  }
}
# Converts the feature frames into matrices.
# If mask sums to a positive value, Xtrain is encoded with the transform named by
# `code` (e.g. "recode" or "dummycode") on the columns listed by vectorToCsv(mask);
# Xtest is encoded with the same meta data unless cv is TRUE, in which case it is
# only cast to a matrix. Otherwise both frames are cast to matrices directly.
recodeData = function(Frame[Unknown] Xtrain, Frame[Unknown] Xtest, Matrix[Double] mask, Boolean cv, String code)
return(Matrix[Double] eXtrain, Matrix[Double] eXtest)
{
  hasCategorical = sum(mask) > 0
  if(!hasCategorical)
  {
    # no categorical value exists, a plain cast is enough
    eXtrain = as.matrix(Xtrain)
    eXtest = as.matrix(Xtest)
  }
  else
  {
    columnsCsv = vectorToCsv(mask)
    encodeSpec = "{ids:true, "+code+":["+columnsCsv+"]}"
    [eXtrain, encodeMeta] = transformencode(target=Xtrain, spec=encodeSpec);
    if(cv)
      eXtest = as.matrix(Xtest)
    else
      eXtest = transformapply(target=Xtest, spec=encodeSpec, meta=encodeMeta);
  }
}
# Splits a frame into features X and label Y.
# When isLastLabel is TRUE the last column becomes Y and all preceding columns become X.
# Otherwise X is the whole frame and Y is a 1x1 placeholder frame holding "0".
getLabel = function(Frame[Unknown] data, Boolean isLastLabel)
return(Frame[Unknown] X, Frame[Unknown] Y)
{
  if(!isLastLabel)
  {
    X = data
    Y = as.frame("0")
  }
  else
  {
    labelCol = ncol(data)
    featureCols = labelCol - 1
    X = data[, 1:featureCols]
    Y = data[, labelCol]
  }
}
# Scores the evaluation function on the data without running any pipeline.
# Steps: detect the schema of X and flag STRING/BOOLEAN columns; if that detected
# mask disagrees with the mask from metaList in at least one position, the detected
# mask is used instead; recode both frames, replace NaN by 1, dummycode with the
# same mask, and return cell [1,1] of the evaluation result.
getDirtyScore = function(Frame[Unknown] X, Matrix[Double] Y, Frame[Unknown] Xtest, Matrix[Double] Ytest, List[Unknown] metaList, String evaluationFunc,
  Matrix[Double] evalFunHp)
return(Double dirtyScore)
{
  detected = detectSchema(X)
  nCols = ncol(detected)
  detectedMask = matrix(0, rows=1, cols=nCols)
  for(j in 1:nCols)
  {
    colType = as.scalar(detected[1, j])
    if(colType == "STRING" | colType == "BOOLEAN")
      detectedMask[1, j] = 1
  }

  # fall back to the detected mask when not every position agrees
  givenMask = as.matrix(metaList['mask'])
  agreeing = sum(givenMask == detectedMask)
  mask = ifelse(agreeing < ncol(givenMask), detectedMask, givenMask)

  [codedTrain, codedTest] = recodeData(X, Xtest, mask, FALSE, "recode")
  codedTrain = replace(target=codedTrain, pattern=NaN, replacement=1)
  codedTest = replace(target=codedTest, pattern=NaN, replacement=1)
  [dummyTrain, dummyTest] = recodeData(as.frame(codedTrain), as.frame(codedTest), mask, FALSE, "dummycode")

  evalResult = eval(evaluationFunc, list(X=dummyTrain, Y=Y, Xtest=dummyTest, Ytest=Ytest, Xorig=as.matrix(0), evalFunHp=evalFunHp, trainML = FALSE))
  dirtyScore = as.scalar(evalResult[1, 1])
}