| #------------------------------------------------------------- |
| # |
| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| # |
| #------------------------------------------------------------- |
| |
# This script reads the dirty and clean data, applies the best pipeline on the dirty data,
# classifies the cleaned dataset, and checks whether the cleaned dataset performs the same
# as the original dataset in terms of classification accuracy.
| # |
# INPUT:
# -------------------------------------------------------------------------------
# trainData      --- training dataset as a frame (label in the last column if isLastLabel=TRUE)
# testData       --- test dataset as a frame, with the same schema as trainData
# metaData       --- meta information (schema, mask, FD mask) of the dataset, or "NULL"
# pip            --- best pipeline, i.e., the sequence of cleaning primitives to apply
# applyFunc      --- apply-functions corresponding to the pipeline primitives
# hp             --- hyperparameter matrix of the pipeline primitives
# cvk            --- number of cross-validation folds (default 3)
# evaluationFunc --- name of the evaluation function, e.g., a classification UDF
# evalFunHp      --- hyperparameters of the evaluation function
# isLastLabel    --- TRUE if the label is the last column of the data frames
# correctTypos   --- TRUE to apply typo correction in the string pipeline
# -------------------------------------------------------------------------------
#
# OUTPUT:
# ------------------------------------------------------------------------------------------------
# scores         --- 1x3 matrix: [dirty-data score, train accuracy (currently disabled), test accuracy]
# cleanTrain     --- cleaned and encoded training data with the label as last column
# cleanTest      --- cleaned and encoded test data with the label as last column
# externalState  --- list of transform metadata (e.g., recode maps) for re-applying encodings
# iState         --- internal state of the pipeline execution returned by executePipeline
# ------------------------------------------------------------------------------------------------
| |
| source("scripts/pipelines/scripts/utils.dml") as utils; |
| source("scripts/builtin/topk_cleaning.dml") as topk; |
| source("scripts/builtin/bandit.dml") as bandit; |
| |
| s_fit_pipeline = function(Frame[Unknown] trainData, Frame[Unknown] testData, Frame[Unknown] metaData = as.frame("NULL"), |
| Frame[Unknown] pip, Frame[Unknown] applyFunc, Matrix[Double] hp, Integer cvk=3, String evaluationFunc, Matrix[Double] evalFunHp, |
| Boolean isLastLabel = TRUE, Boolean correctTypos=FALSE) |
| return (Matrix[Double] scores, Matrix[Double] cleanTrain, Matrix[Double] cleanTest, List[Unknown] externalState, List[Unknown] iState) |
| { |
| externalState = list() |
| no_of_flag_vars = 5 |
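  # derive the schema, categorical mask, FD mask, and label mask from the metadata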
| [schema, mask, fdMask, maskY] = topk::prepareMeta(trainData, metaData) |
| |
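  # drop empty columns from the pipeline and apply-function frames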
| pip = removeEmpty(target=pip, margin="cols") |
| applyFunc = removeEmpty(target=applyFunc, margin="cols") |
| metaList = list(mask=mask, schema=schema, fd=fdMask, applyFunc=as.frame("NULL")) |
| ctx = list(prefix="----"); #TODO include seed |
| # separate the label |
| [Xtrain, Ytrain] = topk::getLabel(trainData, isLastLabel) |
| [Xtest, Ytest] = topk::getLabel(testData, isLastLabel) |
| |
| # always recode the label |
| if(maskY == 1) { |
| [eYtrain, M] = transformencode(target=Ytrain, spec= "{ids:true, recode:[1]}"); |
| eYtest = transformapply(target=Ytest, spec= "{ids:true, recode:[1]}", meta=M); |
| externalState = append(externalState, M) |
| } |
| else |
| { |
| eYtrain = as.matrix(Ytrain) |
| eYtest = as.matrix(Ytest) |
| } |
  # # # when the evaluation function is called for the first time, we also compute
  # # # and keep the hyperparameters of the target application
| ctx = list(prefix="evaluate Pipeline") |
| dirtyScore = topk::getDirtyScore(X=Xtrain, Y=eYtrain, Xtest=Xtest, Ytest=eYtest, metaList=metaList, |
| evaluationFunc=evaluationFunc, evalFunHp=evalFunHp, ctx=ctx) |
| [Xtrain, Xtest] = topk::runStringPipeline(Xtrain, Xtest, schema, mask, FALSE, correctTypos, ctx) |
| |
  # # # if the mask has 1s then there are categorical features to recode
| [eXtrain, eXtest, M1] = topk::recodeData(Xtrain, Xtest, mask, FALSE, "recode") |
| externalState = append(externalState, M1) |
| # # # do the early dropping |
| # [eXtrain, eXtest, metaList] = topk::featureDrop(eXtrain, eXtest, metaList, FALSE) |
| metaList["applyFunc"] = applyFunc |
| |
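  # hp encodes all hyperparameters in its first row: hp[1,1] holds the number of values
  # that follow, which are reshaped into one row of hyperparameters per pipeline primitive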
| no_of_param = as.scalar(hp[1, 1]) + 1 |
  hp_width = hp[1, 2:no_of_param]
| hp_matrix = matrix(hp_width, rows=ncol(pip), cols=ncol(hp_width)/ncol(pip)) |
| pipList = list(ph = pip, hp = hp_matrix, flags = no_of_flag_vars) |
| |
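  # cross-validate the pipeline together with the evaluation function on the training data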
| [trainScore, evalFunHp] = bandit::crossV(X=eXtrain, y=eYtrain, cvk=cvk, evalFunHp=evalFunHp, |
| pipList=pipList, metaList=metaList, evalFunc=evaluationFunc) |
| print("train score cv: "+toString(trainScore)) |
| |
| |
| # # # now test accuracy |
| [eXtrain, eYtrain, eXtest, eYtest, a, b, c, d, iState] = executePipeline(pipeline=pip, Xtrain=eXtrain, Ytrain=eYtrain, |
| Xtest=eXtest, Ytest=eYtest, metaList=metaList, hyperParameters=hp_matrix, flagsCount=no_of_flag_vars, test=TRUE, verbose=FALSE) |
| |
| if(max(eYtrain) == min(eYtrain)) |
| stop("Y contains only one class") |
| |
| # score = eval(evaluationFunc, list(X=eXtrain, Y=eYtrain, Xtest=eXtrain, Ytest=eYtrain, Xorig=as.matrix(0), evalFunHp=evalFunHp)) |
| # trainAccuracy = as.scalar(score[1, 1]) |
| |
| score = eval(evaluationFunc, list(X=eXtrain, Y=eYtrain, Xtest=eXtest, Ytest=eYtest, Xorig=as.matrix(0), evalFunHp=evalFunHp)) |
| testAccuracy = as.scalar(score[1, 1]) |
| |
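  # assemble the results: col 1 = score on the dirty data, col 2 = train accuracy
  # (currently disabled above), col 3 = test accuracy on the cleaned data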
| scores = matrix(0, rows=1, cols=3) |
| scores[1, 1] = dirtyScore |
| # scores[1, 2] = trainAccuracy |
| scores[1, 3] = testAccuracy |
| cleanTrain = cbind(eXtrain, eYtrain) |
| cleanTest = cbind(eXtest, eYtest) |
| } |