#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
# This script reads dirty train and test data, applies the given (best) cleaning
# pipeline with its hyper-parameters on the dirty data, and then classifies the
# cleaned dataset to check whether it performs as well as the original dataset
# in terms of classification accuracy.
#
# INPUT:
# -------------------------------------------------------------------------------
# trainData       training data frame, including the label column
# testData        test data frame, including the label column
# metaData        meta-information frame (schema, feature mask, FD mask, label
#                 mask); pass as.frame("NULL") if not available
# pip             frame with the (best) pipeline of cleaning primitives
# applyFunc       frame with the apply functions of the pipeline primitives
# hp              matrix with the hyper-parameters of the pipeline primitives
# cvk             number of cross-validation folds
# evaluationFunc  name of the evaluation function (called via eval) of the
#                 target application, e.g., a classifier
# evalFunHp       hyper-parameters of the evaluation function
# isLastLabel     TRUE if the label is the last column of the data frames
# correctTypos    TRUE to correct typos in string columns
# -------------------------------------------------------------------------------
#
# OUTPUT:
# ------------------------------------------------------------------------------------------------
# scores          1x3 matrix: [dirty-data baseline score, train accuracy
#                 (currently disabled), test accuracy]
# cleanTrain      cleaned training features with the encoded label appended
# cleanTest       cleaned test features with the encoded label appended
# externalState   list of transformencode meta frames needed to replay the encoding
# iState          internal pipeline state returned by executePipeline
# ------------------------------------------------------------------------------------------------
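#
# Example usage (a minimal sketch, not part of the original sources): the file
# paths, the evaluation-function name "evalClassification", and the evalFunHp
# shape are hypothetical; pip, applyFunc, and hp would normally come from the
# top-k output of topk_cleaning.
#
#   trainData = read("train_dirty.csv", data_type="frame", format="csv", header=TRUE)
#   testData = read("test_dirty.csv", data_type="frame", format="csv", header=TRUE)
#   [scores, cleanTrain, cleanTest, extState, iState] = fit_pipeline(
#     trainData=trainData, testData=testData, pip=pip, applyFunc=applyFunc,
#     hp=hp, cvk=3, evaluationFunc="evalClassification",
#     evalFunHp=matrix(0, rows=1, cols=4), isLastLabel=TRUE, correctTypos=FALSE)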
source("scripts/pipelines/scripts/utils.dml") as utils;
source("scripts/builtin/topk_cleaning.dml") as topk;
source("scripts/builtin/bandit.dml") as bandit;
s_fit_pipeline = function(Frame[Unknown] trainData, Frame[Unknown] testData, Frame[Unknown] metaData = as.frame("NULL"),
Frame[Unknown] pip, Frame[Unknown] applyFunc, Matrix[Double] hp, Integer cvk=3, String evaluationFunc, Matrix[Double] evalFunHp,
Boolean isLastLabel = TRUE, Boolean correctTypos=FALSE)
return (Matrix[Double] scores, Matrix[Double] cleanTrain, Matrix[Double] cleanTest, List[Unknown] externalState, List[Unknown] iState)
{
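# externalState collects the transformencode meta frames (label/feature recode
# maps) so the same encoding can later be applied to unseen data;
# no_of_flag_vars is the number of control flags expected by executePipeline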
externalState = list()
no_of_flag_vars = 5
[schema, mask, fdMask, maskY] = topk::prepareMeta(trainData, metaData)
pip = removeEmpty(target=pip, margin="cols")
applyFunc = removeEmpty(target=applyFunc, margin="cols")
metaList = list(mask=mask, schema=schema, fd=fdMask, applyFunc=as.frame("NULL"))
ctx = list(prefix="----"); #TODO include seed
# separate the label
[Xtrain, Ytrain] = topk::getLabel(trainData, isLastLabel)
[Xtest, Ytest] = topk::getLabel(testData, isLastLabel)
# always recode the label
if(maskY == 1) {
[eYtrain, M] = transformencode(target=Ytrain, spec= "{ids:true, recode:[1]}");
eYtest = transformapply(target=Ytest, spec= "{ids:true, recode:[1]}", meta=M);
externalState = append(externalState, M)
}
else
{
eYtrain = as.matrix(Ytrain)
eYtest = as.matrix(Ytest)
}
# # # the first call to the evaluation function also computes and keeps the hyper-parameters of the target application
ctx = list(prefix="evaluate Pipeline")
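# # # baseline: score of the evaluation function on the dirty (uncleaned) data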
dirtyScore = topk::getDirtyScore(X=Xtrain, Y=eYtrain, Xtest=Xtest, Ytest=eYtest, metaList=metaList,
evaluationFunc=evaluationFunc, evalFunHp=evalFunHp, ctx=ctx)
[Xtrain, Xtest] = topk::runStringPipeline(Xtrain, Xtest, schema, mask, FALSE, correctTypos, ctx)
# # # if the mask contains 1s, there are categorical features to recode
[eXtrain, eXtest, M1] = topk::recodeData(Xtrain, Xtest, mask, FALSE, "recode")
externalState = append(externalState, M1)
# # # early feature dropping (currently disabled)
# [eXtrain, eXtest, metaList] = topk::featureDrop(eXtrain, eXtest, metaList, FALSE)
metaList["applyFunc"] = applyFunc
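# # # unroll the hyper-parameters: hp[1,1] holds the count of values that follow
# # # in the row; reshape them into one row of hyper-parameters per pipeline op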
no_of_param = as.scalar(hp[1, 1]) + 1
hp_width= hp[1, 2:no_of_param]
hp_matrix = matrix(hp_width, rows=ncol(pip), cols=ncol(hp_width)/ncol(pip))
pipList = list(ph = pip, hp = hp_matrix, flags = no_of_flag_vars)
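# # # cross-validate the pipeline plus evaluation function on the training data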
[trainScore, evalFunHp] = bandit::crossV(X=eXtrain, y=eYtrain, cvk=cvk, evalFunHp=evalFunHp,
pipList=pipList, metaList=metaList, evalFunc=evaluationFunc)
print("train score cv: "+toString(trainScore))
# # # now test accuracy
[eXtrain, eYtrain, eXtest, eYtest, a, b, c, d, iState] = executePipeline(pipeline=pip, Xtrain=eXtrain, Ytrain=eYtrain,
Xtest=eXtest, Ytest=eYtest, metaList=metaList, hyperParameters=hp_matrix, flagsCount=no_of_flag_vars, test=TRUE, verbose=FALSE)
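# # # abort if cleaning/encoding collapsed the label into a single class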
if(max(eYtrain) == min(eYtrain))
stop("Y contains only one class")
# score = eval(evaluationFunc, list(X=eXtrain, Y=eYtrain, Xtest=eXtrain, Ytest=eYtrain, Xorig=as.matrix(0), evalFunHp=evalFunHp))
# trainAccuracy = as.scalar(score[1, 1])
score = eval(evaluationFunc, list(X=eXtrain, Y=eYtrain, Xtest=eXtest, Ytest=eYtest, Xorig=as.matrix(0), evalFunHp=evalFunHp))
testAccuracy = as.scalar(score[1, 1])
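# scores layout: [dirtyScore, trainAccuracy (currently disabled), testAccuracy]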
scores = matrix(0, rows=1, cols=3)
scores[1, 1] = dirtyScore
# scores[1, 2] = trainAccuracy
scores[1, 3] = testAccuracy
cleanTrain = cbind(eXtrain, eYtrain)
cleanTest = cbind(eXtest, eYtest)
}