blob: 34ae24bbe255555197e2a7676fd46f1e6c13fc4d [file] [log] [blame]
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
##################################################################################################################
# This script reads the dirty and clean data, applies the best pipeline to the dirty data,
# then classifies both the cleaned and the original dataset and checks whether the cleaned dataset
# performs the same as the original dataset in terms of classification accuracy
# Vocab = original data -> dataset without any noise, the original version with ground truths
# cleaned data -> dirty dataset cleaned by pipeline
# read the items
# dirty dataset F
# clean dataset O
# metadata (schema and mask)
# best k pipelines and hyperparameters generated by previous script mainScript.dml
# do the initial preprocessing like dropping invalid values so that pipeline could fix them
# then recode the data to bring it into matrix format
# then construct the hyper-parameters list and call the executePipeline() on the dirty dataset
# for the comparison OHE the original dataset, there is no need to OHE the cleaned dataset because cleaning pipeline
# has a primitive for this
# Call multiLogReg on both datasets and compare accuracy using k=3 cross validation
######################################################################################################################
source("scripts/pipelines/scripts/utils.dml") as utils;
# $1: dirty dataset (CSV frame); the listed tokens are treated as missing values
F = read($1, data_type="frame", format="csv", header=TRUE,
naStrings= ["NA", "null"," ","NaN", "nan", "", "?", "99999"]);
# $2: metadata frame (schema and mask rows); first column is a label column and is dropped below
metaInfo = read($2, data_type="frame", format="csv", header=FALSE);
# $3: directory containing the artifacts produced by the previous top-k pipeline search
input = $3
pip = read(input+"pip.csv", data_type="frame", format="csv", header=FALSE);
applyFunc = read(input+"applyFunc.csv", data_type="frame", format="csv", header=FALSE);
hp = read(input+"hp.csv", data_type="matrix", format="csv", header=FALSE);
evalHp = read(input+"evalHp.csv", data_type="matrix", format="csv", header=FALSE);
# dirtyScore = read(input+"dirtyScore.csv", data_type="scalar", value_type="double");
# $5: fraction of rows used for training (NOTE(review): $4 is not read anywhere in this script — confirm intended)
trainTestSplit = as.double($5)
# drop the first metadata column (row labels), keep only per-attribute schema/mask info
metaInfo = metaInfo[, 2:ncol(metaInfo)]
# NOTE(review): split may be fractional when nrow(F)*trainTestSplit is not an integer — confirm indexing tolerates this
split = nrow(F) * trainTestSplit
trainData = F[1:split,]
# rows after the split point form the test set
# NOTE(review): relies on DML parsing "split+1:nrow(F)" as the range (split+1):nrow(F) — confirm precedence
testData = F[split+1:nrow(F),]
print("pipeline: "+toString(pip[1]))
# fit the best (first) pipeline on the train/test split; returns scores plus the
# transformed matrices and the encoder/imputation state needed to re-apply the pipeline
[result, trX, tsX, exState, iState] = fit_pipeline(trainData, testData, metaInfo, pip[1,], applyFunc[1,], hp[1,], 3, "evalClassification", evalHp, TRUE, FALSE)
# re-apply the same fitted pipeline to the raw test data; should reproduce tsX
eXtest = apply_pipeline(testData, metaInfo, pip[1,], applyFunc[1,], hp[1,], TRUE, exState, iState, FALSE)
result = as.frame(result)
# success criterion 1: test accuracy (col 3) beats the dirty-data accuracy (col 1)
resultBool = as.scalar(result[1, 3] > result[1, 1])
eXtest = replace(target=eXtest, pattern=NaN, replacement=0)
tsX = replace(target=tsX, pattern=NaN, replacement=0)
# spot-check rows 51..111 for an exact match between re-applied and fitted test matrices
# NOTE(review): magic row range 51:111 — presumably an arbitrary sample window; confirm
resApply = sum(eXtest[51:111] - tsX[51:111, 1:ncol(eXtest)]) == 0
# fraction of cells where the re-applied result exceeds the fitted one
# NOTE(review): counts only positive differences (eXtest > tsX), not absolute deviation — confirm intended
percent = sum(eXtest-tsX[, 1:ncol(eXtest)] > 0) / (nrow(eXtest) * ncol(eXtest))
# success criterion 2: fewer than 5% of cells deviate
errorMargin = percent < 0.05
resultBool = resultBool & errorMargin
# $6: output path for the overall pass/fail flag
write(resultBool, $6)
header = frame(["dirty acc", "train acc", "test acc"], rows=1, cols=3)
writeRes = rbind(header, result)
print(toString(writeRes))
# UDF used by fit_pipeline to score a candidate pipeline via classification accuracy.
# Inputs: train matrices X/Y, test matrices Xtest/Ytest, optional original data Xorig,
# and evalFunHp (hyper-parameters for multiLogReg: icpt, reg, tol).
# If evalFunHp[1,1] is NA, the hyper-parameters are first tuned via gridSearch with
# 3-fold cross validation; otherwise the supplied values are used directly.
# Returns: output = cbind(accuracy, evalFunHp); error = per-row misclassification indicator.
evalClassification = function(Matrix[Double] X, Matrix[Double] Y, Matrix[Double] Xtest, Matrix[Double] Ytest, Matrix[Double] Xorig=as.matrix(0),
  Matrix[Double] evalFunHp)
return(Matrix[Double] output, Matrix[Double] error)
{
  if(is.na(as.scalar(evalFunHp[1,1])))
  {
    # no hyper-parameters supplied: tune icpt/reg/tol/maxi by cross-validated grid search
    nc = max(Y);
    params = list("icpt", "reg", "tol", "maxi")
    paramRanges = list(seq(0, 2, 1), 10^seq(1,-3), 10^seq(1,-5), 10^seq(1,3));
    trainArgs = list(X=X, y=Y, icpt=-1, reg=-1, tol=1e-9, maxi=100, maxii=-1, verbose=FALSE);
    # BUGFIX: the fold count must be passed as cvk; the original passed cv twice (cv=TRUE, cv=3)
    [B1,opt] = gridSearch(X=X, y=Y, train="multiLogReg", predict="accuracy", numB=(ncol(X)+1)*(nc-1),
      params=params, paramValues=paramRanges, trainArgs=trainArgs, cv=TRUE, cvk=3, verbose=FALSE);
    evalFunHp = as.matrix(opt)
  }
  if(min(Y) == max(Y))
  {
    # degenerate case: a single class in Y — training is meaningless, report zero accuracy
    accuracy = as.matrix(0)
    # BUGFIX: error must be assigned on every path (it is a declared function output)
    error = matrix(0, rows=nrow(Ytest), cols=1)
  }
  else {
    # train with the (tuned or supplied) hyper-parameters, then score on the test split
    beta = multiLogReg(X=X, Y=Y, icpt=as.scalar(evalFunHp[1,1]), reg=as.scalar(evalFunHp[1,2]), tol=as.scalar(evalFunHp[1,3]),
      maxi=1000, maxii=0, verbose=FALSE);
    [prob, yhat, accuracy] = multiLogRegPredict(Xtest, beta, Ytest, FALSE)
    # 1 where the prediction disagrees with the ground truth, 0 otherwise
    error = yhat != Ytest
    accuracy = as.matrix(accuracy)
    print("accuracy: "+toString(accuracy))
  }
  output = cbind(accuracy, evalFunHp)
}
# Predict hook for gridSearch: returns the misclassification rate (1 - accuracy)
# of model B evaluated on features X with labels y, as a 1x1 matrix.
accuracy = function(Matrix[Double] X, Matrix[Double] y, Matrix[Double] B) return (Matrix[Double] err) {
  [probs, predictions, accPct] = multiLogRegPredict(X=X, B=B, Y=y, verbose=FALSE);
  # multiLogRegPredict reports accuracy as a percentage; convert to an error fraction
  errRate = 1 - (accPct / 100);
  err = as.matrix(errRate);
}