| #------------------------------------------------------------- |
| # |
| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| # |
| #------------------------------------------------------------- |
| |
| source("scripts/pipelines/scripts/utils.dml") as utils; |
| source("scripts/builtin/bandit.dml") as bandit; |
| # Applies a given cleaning pipeline to train/test data and reports its scores next to |
| # the score obtained on the raw ("dirty") data. |
| # |
| # INPUT: |
| #   trainData, testData  frames with the features; if isLastLabel is TRUE the last column is the label |
| #   metaData             frame with row 1 = schema, row 2 = categorical mask, row 3 = functional |
| #                        dependency mask; its last column describes the label |
| #   lp, pip              logical and physical pipeline, forwarded to executePipeline |
| #   hp                   row matrix; hp[1,1] is the number of hyper-parameter values stored after it |
| #   evaluationFunc       name of the scoring function, invoked through eval |
| #   evalFunHp            hyper-parameters forwarded to evaluationFunc |
| #   isLastLabel          TRUE if the last column of trainData/testData holds the label |
| #   correctTypos         forwarded to the string-processing step |
| # |
| # OUTPUT: |
| #   result               1x3 matrix: [dirty score, train score, test score] |
| s_applyAndEvaluate = function(Frame[Unknown] trainData, Frame[Unknown] testData, Frame[Unknown] metaData = as.frame("NULL"), |
|   Frame[Unknown] lp, Frame[Unknown] pip, Matrix[Double] hp, String evaluationFunc, Matrix[Double] evalFunHp, |
|   Boolean isLastLabel = TRUE, Boolean correctTypos=FALSE) |
| return (Matrix[Double] result) |
| { |
|   no_of_flag_vars = 5 |
|   # the last metadata column belongs to the label, all others describe the features |
|   schema = metaData[1, 1:ncol(metaData) - 1] |
|   mask = as.matrix(metaData[2, 1:ncol(metaData) - 1]) |
|   fdMask = as.matrix(metaData[3, 1:ncol(metaData) - 1]) |
|   maskY = as.integer(as.scalar(metaData[2, ncol(metaData)])) |
|   metaList = list(mask=mask, schema=schema, fd=fdMask) |
| |
|   # separate the label |
|   [Xtrain, Ytrain] = getLabel(trainData, isLastLabel) |
|   [Xtest, Ytest] = getLabel(testData, isLastLabel) |
| |
|   # recode the label if it is flagged as categorical; the test label reuses the train dictionary |
|   if(maskY == 1) { |
|     [eYtrain, M] = transformencode(target=Ytrain, spec= "{ids:true, recode:[1]}"); |
|     eYtest = transformapply(target=Ytest, spec= "{ids:true, recode:[1]}", meta=M); |
|   } |
|   else |
|   { |
|     eYtrain = as.matrix(Ytrain) |
|     eYtest = as.matrix(Ytest) |
|   } |
|   # score the raw data first; this must happen before the features are string-processed below |
|   dirtyScore = getDirtyScore(X=Xtrain, Y=eYtrain, Xtest=Xtest, Ytest=eYtest, metaList=metaList, evaluationFunc=evaluationFunc, evalFunHp=evalFunHp) |
|   [Xtrain, Xtest] = runStringPipeline(Xtrain, Xtest, schema, mask, FALSE, correctTypos) |
| |
|   # recode the categorical features (columns flagged with 1 in the mask) |
|   [eXtrain, eXtest] = recodeData(Xtrain, Xtest, mask, FALSE, "recode") |
| |
|   # hp[1,1] holds the number of hyper-parameter values that follow it; |
|   # reshape those values into one row per column of pip |
|   no_of_param = as.scalar(hp[1, 1]) + 1 |
|   hp_width= hp[1, 2:no_of_param] |
|   hp_matrix = matrix(hp_width, rows=ncol(pip), cols=ncol(hp_width)/ncol(pip)) |
| |
|   # apply the pipeline to both train and test data |
|   [eXtrain, eYtrain, eXtest, eYtest, a, b,Tr] = executePipeline(logical=lp, pipeline=pip, X=eXtrain, Y=eYtrain, Xtest=eXtest, Ytest=eYtest, metaList=metaList, |
|     hyperParameters=hp_matrix, flagsCount=no_of_flag_vars, test=TRUE, verbose=FALSE) |
| |
|   if(max(eYtrain) == min(eYtrain)) |
|     stop("Y contains only one class") |
| |
|   # train score: evaluate on the cleaned training data itself |
|   score = eval(evaluationFunc, list(X=eXtrain, Y=eYtrain, Xtest=eXtrain, Ytest=eYtrain, Xorig=as.matrix(0), evalFunHp=evalFunHp, trainML = FALSE)) |
|   trainAccuracy = as.scalar(score[1, 1]) |
| |
|   # test score: evaluate on the cleaned test data |
|   score = eval(evaluationFunc, list(X=eXtrain, Y=eYtrain, Xtest=eXtest, Ytest=eYtest, Xorig=as.matrix(0), evalFunHp=evalFunHp, trainML = FALSE)) |
|   testAccuracy = as.scalar(score[1, 1]) |
| |
|   result = matrix(0, rows=1, cols=3) |
|   result[1, 1] = dirtyScore |
|   result[1, 2] = trainAccuracy |
|   result[1, 3] = testAccuracy |
| } |
| |
| # Runs utils::stringProcessing on the features. |
| # With cv = TRUE only Xtrain is processed and Xtest is passed through untouched; |
| # otherwise train and test are stacked, processed together and split again. |
| runStringPipeline = function(Frame[Unknown] Xtrain, Frame[Unknown] Xtest, Frame[String] schema, |
|   Matrix[Double] mask, Boolean cv, Boolean correctTypos = FALSE) |
| return(Frame[Unknown] Xtrain, Frame[Unknown] Xtest) |
| { |
|   if(!cv) |
|   { |
|     # stack train on top of test so both are processed with the same dictionary |
|     nTrain = nrow(Xtrain) |
|     stacked = rbind(Xtrain, Xtest) |
|     processed = utils::stringProcessing(data=stacked, mask=mask, schema=schema, CorrectTypos=correctTypos) |
|     # the first nTrain rows are the train part, the remainder is the test part |
|     firstTestRow = nTrain + 1 |
|     Xtrain = processed[1:nTrain,] |
|     Xtest = processed[firstTestRow:nrow(processed),] |
|   } |
|   else |
|   { |
|     Xtrain = utils::stringProcessing(data=Xtrain, mask=mask, schema=schema, CorrectTypos=correctTypos) |
|   } |
| } |
| |
| # Encodes the columns flagged in mask using the transformation named by code |
| # (the callers in this file pass "recode" and "dummycode"). |
| # The encoder is built on Xtrain; unless cv is TRUE it is also applied to Xtest. |
| recodeData = function(Frame[Unknown] Xtrain, Frame[Unknown] Xtest, Matrix[Double] mask, Boolean cv, String code) |
| return(Matrix[Double] eXtrain, Matrix[Double] eXtest) |
| { |
|   hasFlaggedCols = sum(mask) > 0 |
|   if(!hasFlaggedCols) |
|   { |
|     # nothing to encode: just cast both frames to matrices |
|     eXtrain = as.matrix(Xtrain) |
|     eXtest = as.matrix(Xtest) |
|   } |
|   else |
|   { |
|     colList = vectorToCsv(mask) |
|     encSpec = "{ids:true, "+code+":["+colList+"]}" |
|     [eXtrain, encMeta] = transformencode(target=Xtrain, spec=encSpec); |
|     if(cv) |
|       eXtest = as.matrix(Xtest) |
|     else |
|       eXtest = transformapply(target=Xtest, spec=encSpec, meta=encMeta); |
|   } |
| } |
| |
| # Splits a frame into features X and label Y. |
| # If isLastLabel is TRUE the last column is returned as Y and the other columns as X; |
| # otherwise X is the whole frame and Y is a single-cell frame holding "0". |
| getLabel = function(Frame[Unknown] data, Boolean isLastLabel) |
| return(Frame[Unknown] X, Frame[Unknown] Y) |
| { |
|   if(!isLastLabel) |
|   { |
|     X = data |
|     Y = as.frame("0") |
|   } |
|   else |
|   { |
|     labelCol = ncol(data) |
|     lastFeatureCol = labelCol - 1 |
|     X = data[, 1:lastFeatureCol] |
|     Y = data[, labelCol] |
|   } |
| } |
| |
| # Computes the score of evaluationFunc on the raw data: the features are only recoded, |
| # have their NaNs replaced by 1 and are dummy-coded; no pipeline is applied. |
| # Returns the value found in cell [1,1] of the evaluation result. |
| getDirtyScore = function(Frame[Unknown] X, Matrix[Double] Y, Frame[Unknown] Xtest, Matrix[Double] Ytest, List[Unknown] metaList, String evaluationFunc, |
|   Matrix[Double] evalFunHp) |
| return(Double dirtyScore) |
| { |
|   # build a mask from the detected schema: STRING and BOOLEAN columns are flagged with 1 |
|   detectedSchema = detectSchema(X) |
|   detectedMask = matrix(0, rows=1, cols=ncol(detectedSchema)) |
|   for(j in 1:ncol(detectedSchema)) |
|   { |
|     colType = as.scalar(detectedSchema[1, j]) |
|     if(colType == "STRING" | colType == "BOOLEAN") |
|       detectedMask[1, j] = 1 |
|   } |
| |
|   # use the detected mask whenever it differs from the supplied one in any column |
|   givenMask = as.matrix(metaList['mask']) |
|   nMatching = sum(givenMask == detectedMask) |
|   mask = ifelse(nMatching < ncol(givenMask), detectedMask, givenMask) |
| |
|   # recode, replace NaNs by 1, then dummy-code the same columns |
|   [rXtrain, rXtest] = recodeData(X, Xtest, mask, FALSE, "recode") |
|   rXtrain = replace(target=rXtrain, pattern=NaN, replacement=1) |
|   rXtest = replace(target=rXtest, pattern=NaN, replacement=1) |
|   [dXtrain, dXtest] = recodeData(as.frame(rXtrain), as.frame(rXtest), mask, FALSE, "dummycode") |
| |
|   evalOut = eval(evaluationFunc, list(X=dXtrain, Y=Y, Xtest=dXtest, Ytest=Ytest, Xorig=as.matrix(0), evalFunHp=evalFunHp, trainML = FALSE)) |
|   dirtyScore = as.scalar(evalOut[1, 1]) |
| } |