#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
# This script reads dirty train and test data, applies the given (best) cleaning
# pipeline with its hyper-parameters on the dirty data, and then classifies the
# cleaned dataset to check whether it performs as well as the original dataset
# in terms of classification accuracy.
#
# INPUT:
# -------------------------------------------------------------------------------
# trainData       training data frame, including the label column
# testData        test data frame, including the label column
# metaData        meta-information frame (schema, feature mask, FD mask, label
#                 mask); pass as.frame("NULL") if not available
# pip             frame with the (best) pipeline of cleaning primitives
# applyFunc       frame with the apply functions of the pipeline primitives
# hp              matrix with the hyper-parameters of the pipeline primitives
# cvk             number of cross-validation folds
# evaluationFunc  name of the evaluation function (called via eval) of the
#                 target application, e.g., a classifier
# evalFunHp       hyper-parameters of the evaluation function
# isLastLabel     TRUE if the label is the last column of the data frames
# correctTypos    TRUE to correct typos in string columns
# -------------------------------------------------------------------------------
#
# OUTPUT:
# ------------------------------------------------------------------------------------------------
# scores          1x3 matrix: [dirty-data baseline score, train accuracy
#                 (currently disabled), test accuracy]
# cleanTrain      cleaned training features with the encoded label appended
# cleanTest       cleaned test features with the encoded label appended
# externalState   list of transformencode meta frames needed to replay the encoding
# iState          internal pipeline state returned by executePipeline
# ------------------------------------------------------------------------------------------------
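#
# Example usage (a minimal sketch, not part of the original sources): the file
# paths, the evaluation-function name "evalClassification", and the evalFunHp
# shape are hypothetical; pip, applyFunc, and hp would normally come from the
# top-k output of topk_cleaning.
#
#   trainData = read("train_dirty.csv", data_type="frame", format="csv", header=TRUE)
#   testData = read("test_dirty.csv", data_type="frame", format="csv", header=TRUE)
#   [scores, cleanTrain, cleanTest, extState, iState] = fit_pipeline(
#     trainData=trainData, testData=testData, pip=pip, applyFunc=applyFunc,
#     hp=hp, cvk=3, evaluationFunc="evalClassification",
#     evalFunHp=matrix(0, rows=1, cols=4), isLastLabel=TRUE, correctTypos=FALSE)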
source("scripts/pipelines/scripts/utils.dml") as utils;
source("scripts/builtin/topk_cleaning.dml") as topk;
source("scripts/builtin/bandit.dml") as bandit;
s_fit_pipeline = function(Frame[Unknown] trainData, Frame[Unknown] testData, Frame[Unknown] metaData = as.frame("NULL"),
Frame[Unknown] pip, Frame[Unknown] applyFunc, Matrix[Double] hp, Integer cvk=3, String evaluationFunc, Matrix[Double] evalFunHp,
Boolean isLastLabel = TRUE, Boolean correctTypos=FALSE)
return (Matrix[Double] scores, Matrix[Double] cleanTrain, Matrix[Double] cleanTest, List[Unknown] externalState, List[Unknown] iState)
{
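# externalState collects the transformencode meta frames (label/feature recode
# maps) so the same encoding can later be applied to unseen data;
# no_of_flag_vars is the number of control flags expected by executePipeline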
externalState = list()
no_of_flag_vars = 5
[schema, mask, fdMask, maskY] = topk::prepareMeta(trainData, metaData)
pip = removeEmpty(target=pip, margin="cols")
applyFunc = removeEmpty(target=applyFunc, margin="cols")
metaList = list(mask=mask, schema=schema, fd=fdMask, applyFunc=as.frame("NULL"))
ctx = list(prefix="----"); #TODO include seed
# separate the label
[Xtrain, Ytrain] = topk::getLabel(trainData, isLastLabel)
[Xtest, Ytest] = topk::getLabel(testData, isLastLabel)
# always recode the label
if(maskY == 1) {
[eYtrain, M] = transformencode(target=Ytrain, spec= "{ids:true, recode:[1]}");
eYtest = transformapply(target=Ytest, spec= "{ids:true, recode:[1]}", meta=M);
externalState = append(externalState, M)
}
else
{
eYtrain = as.matrix(Ytrain)
eYtest = as.matrix(Ytest)
}
# # # the first call to the evaluation function also computes and keeps the hyper-parameters of the target application
ctx = list(prefix="evaluate Pipeline")
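# # # baseline: score of the evaluation function on the dirty (uncleaned) data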
dirtyScore = topk::getDirtyScore(X=Xtrain, Y=eYtrain, Xtest=Xtest, Ytest=eYtest, metaList=metaList,
evaluationFunc=evaluationFunc, evalFunHp=evalFunHp, ctx=ctx)
[Xtrain, Xtest] = topk::runStringPipeline(Xtrain, Xtest, schema, mask, FALSE, correctTypos, ctx)
# # # if the mask contains 1s, there are categorical features to recode
[eXtrain, eXtest, M1] = topk::recodeData(Xtrain, Xtest, mask, FALSE, "recode")
externalState = append(externalState, M1)
# # # early feature dropping (currently disabled)
# [eXtrain, eXtest, metaList] = topk::featureDrop(eXtrain, eXtest, metaList, FALSE)
metaList["applyFunc"] = applyFunc
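# # # unroll the hyper-parameters: hp[1,1] holds the count of values that follow
# # # in the row; reshape them into one row of hyper-parameters per pipeline op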
no_of_param = as.scalar(hp[1, 1]) + 1
hp_width= hp[1, 2:no_of_param]
hp_matrix = matrix(hp_width, rows=ncol(pip), cols=ncol(hp_width)/ncol(pip))
pipList = list(ph = pip, hp = hp_matrix, flags = no_of_flag_vars)
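# # # cross-validate the pipeline plus evaluation function on the training data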
[trainScore, evalFunHp] = bandit::crossV(X=eXtrain, y=eYtrain, cvk=cvk, evalFunHp=evalFunHp,
pipList=pipList, metaList=metaList, evalFunc=evaluationFunc)
print("train score cv: "+toString(trainScore))
# # # now test accuracy
[eXtrain, eYtrain, eXtest, eYtest, a, b, c, d, iState] = executePipeline(pipeline=pip, Xtrain=eXtrain, Ytrain=eYtrain,
Xtest=eXtest, Ytest=eYtest, metaList=metaList, hyperParameters=hp_matrix, flagsCount=no_of_flag_vars, test=TRUE, verbose=FALSE)
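# # # abort if cleaning/encoding collapsed the label into a single class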
if(max(eYtrain) == min(eYtrain))
stop("Y contains only one class")
# score = eval(evaluationFunc, list(X=eXtrain, Y=eYtrain, Xtest=eXtrain, Ytest=eYtrain, Xorig=as.matrix(0), evalFunHp=evalFunHp))
# trainAccuracy = as.scalar(score[1, 1])
score = eval(evaluationFunc, list(X=eXtrain, Y=eYtrain, Xtest=eXtest, Ytest=eYtest, Xorig=as.matrix(0), evalFunHp=evalFunHp))
testAccuracy = as.scalar(score[1, 1])
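# scores layout: [dirtyScore, trainAccuracy (currently disabled), testAccuracy]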
scores = matrix(0, rows=1, cols=3)
scores[1, 1] = dirtyScore
# scores[1, 2] = trainAccuracy
scores[1, 3] = testAccuracy
cleanTrain = cbind(eXtrain, eYtrain)
cleanTest = cbind(eXtest, eYtest)
}