# src/test/scripts/functions/pipelines/topkcleaningClassificationTest.dml - systemds - Git at Google

 #-------------------------------------------------------------
 #
 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
 #
 #-------------------------------------------------------------
 # Generate the logical pipelines for data cleaning

 source("scripts/pipelines/scripts/utils.dml") as utils;

 # Driver: read the command-line inputs, build the train/test frames, run
 # topk_cleaning with the evaluation UDF defined in this file, and write the
 # returned artifacts below the output directory.

 # -- input frames; the naStrings tokens are read as missing values --
 dirtyFrame = read($dirtyData, data_type="frame", format="csv", header=TRUE,
   naStrings=["NA", "null","  ","NaN", "nan", "", " ", "_nan_", "inf", "?", "NAN", "99999", "99999.00"]);
 metaFrame = read($metaData, data_type="frame", format="csv", header=FALSE);
 primitiveFrame = read($primitives, data_type="frame", format="csv", header=TRUE);
 paramFrame = read($parameters, data_type="frame", format="csv", header=TRUE);

 # -- scalar settings taken from the command line --
 k = $topk
 resourceVal = $rv
 minIncrease = $expectedIncrease
 sampleArg = $sample
 maxIter = $max_iter
 outDir = $output
 useCV = as.logical($testCV)
 numFolds = as.integer($cvk)
 trainFraction = as.double($split)
 evalFuncName = "evalClassification"
 lastTrainRow = nrow(dirtyFrame) * trainFraction

 # -- train/test frames: without CV the leading rows are used for training and
 #    the remaining rows for testing; with CV all rows are used for training
 #    and the test frame is empty --
 if(!useCV) {
   trainFrame = dirtyFrame[1:lastTrainRow,]
   testFrame = dirtyFrame[lastTrainRow+1:nrow(dirtyFrame),]
 }
 else {
   trainFrame = dirtyFrame
   testFrame = frame("", rows=0, cols=0)
 }

 if(nrow(metaFrame) < 2) {
   stop("incomplete meta info")
 }

 # keep meta columns 2..n (the first column is dropped)
 metaCols = metaFrame[, 2:ncol(metaFrame)]

 # reference pipeline and "not yet tuned" marker handed to topk_cleaning
 referencePipeline = frame(["imputeByMean", "scale", "dummycoding"], rows=1, cols=3)
 untunedHp = as.matrix(NaN)

 [pipelines, hyperParams, scores, dirtyScore, tunedHp, applyFuncs] = topk_cleaning(
   dataTrain=trainFrame, dataTest=testFrame, metaData=metaCols,
   primitives=primitiveFrame, parameters=paramFrame,
   refSol=referencePipeline,
   evaluationFunc=evalFuncName, evalFunHp=untunedHp, topK=k,
   resource_val=resourceVal, enablePruning=TRUE,
   expectedIncrease=minIncrease, seed=23, max_iter=maxIter,
   cv=useCV, cvk=numFolds, sample=sampleArg,
   isLastLabel=TRUE, correctTypos=FALSE)

 # -- persist everything returned by topk_cleaning --
 write(pipelines, outDir+"/pip.csv", format="csv")
 write(hyperParams, outDir+"/hp.csv", format="csv")
 write(scores, outDir+"/bestAcc.csv", format="csv")
 write(dirtyScore, outDir+"/dirtyScore.csv", format="csv")
 write(tunedHp, outDir+"/evalHp.csv", format="csv")
 write(applyFuncs, outDir+"/applyFunc.csv", format="csv")

 # test verdict: TRUE when the first top-k score is above the baseline score
 result = dirtyScore < as.scalar(scores[1, 1])
 write(result, $O)


 # UDF for evaluation; this script passes its name to topk_cleaning through
 # the evaluationFunc argument. Trains multiLogReg on (X, Y) and scores the
 # resulting model on (Xtest, Ytest).
 #
 # INPUT:
 #   X, Y          training features and labels
 #   Xtest, Ytest  test features and labels
 #   Xorig         not read by this function (defaults to as.matrix(0))
 #   evalFunHp     hyper-parameters, read as [1,1]=icpt, [1,2]=reg, [1,3]=tol;
 #                 when evalFunHp[1,1] is NaN the fixed values
 #                 "2 10 0.001" are used instead
 # OUTPUT:
 #   output        cbind(accuracy, evalFunHp): the accuracy followed by the
 #                 hyper-parameters that were actually used
 #   error         per-test-row indicator of yhat != Ytest; all ones when Y
 #                 holds a single class and no model is trained
 evalClassification = function(Matrix[Double] X, Matrix[Double] Y, Matrix[Double] Xtest, Matrix[Double] Ytest, Matrix[Double] Xorig=as.matrix(0),
   Matrix[Double] evalFunHp)
 return(Matrix[Double] output, Matrix[Double] error)
 {
   if(is.na(as.scalar(evalFunHp[1,1])))
   {
     # Hyper-parameter tuning via gridSearch is disabled here; the disabled
     # setup is kept for reference and fixed values are used instead.
     # nc = max(Y);
     # params = list("icpt", "reg", "tol")
     # paramRanges = list(seq(0, 2, 1), 10^seq(1,-3), 10^seq(1,-5));
     # trainArgs = list(X=X, Y=Y, icpt=-1, reg=-1, tol=1e-9, maxi=1000, maxii=-1, verbose=FALSE);
     # dataArgs = list("X", "Y");
     # [B1,opt] = gridSearch(X=X, y=Y, train="multiLogReg", predict="accuracy", numB=(ncol(X)+1)*(nc-1),
       # params=params, paramValues=paramRanges, dataArgs=dataArgs, trainArgs=trainArgs, cv=TRUE, cvk=3, verbose=TRUE);
     # evalFunHp = as.matrix(opt) # opt #
     evalFunHp = matrix("2 10 0.001", rows=1, cols=3)
   }
   if(min(Y) == max(Y))
   {
     # Only one class in the training labels: no model is trained and the
     # accuracy is reported as 0. Flag every test row as an error so that the
     # declared return value `error` is bound on this path as well.
     accuracy = as.matrix(0)
     error = matrix(1, rows=nrow(Ytest), cols=1)
   }
   else {
     beta = multiLogReg(X=X, Y=Y, icpt=as.scalar(evalFunHp[1,1]), reg=as.scalar(evalFunHp[1,2]), tol=as.scalar(evalFunHp[1,3]),
       maxi=1000, maxii=0, verbose=FALSE);
     [prob, yhat, accuracy] = multiLogRegPredict(Xtest, beta, Ytest, FALSE)
     error = yhat != Ytest
     accuracy = as.matrix(accuracy)
   }
   output = cbind(accuracy, evalFunHp)
 }

 # Scoring function matching the predict="accuracy" argument of the (currently
 # commented-out) gridSearch call in evalClassification.
 #
 # INPUT:  X features, y labels, B model passed to multiLogRegPredict
 # OUTPUT: err = 1 - acc/100 as a 1x1 matrix, where acc is the third output of
 #         multiLogRegPredict (divided by 100, i.e. treated as a percentage)
 accuracy = function(Matrix[Double] X, Matrix[Double] y, Matrix[Double] B)
   return (Matrix[Double] err)
 {
   [probs, predicted, accPercent] = multiLogRegPredict(X=X, B=B, Y=y, verbose=FALSE)
   errFraction = 1 - (accPercent / 100)
   err = as.matrix(errFraction)
 }


 # Evaluation UDF with the same signature as evalClassification, but based on
 # msvm. Trains msvm on (X, Y) and scores the model on (Xtest, Ytest).
 #
 # INPUT:
 #   X, Y          training features and labels
 #   Xtest, Ytest  test features and labels
 #   Xorig         not read by this function (defaults to as.matrix(0))
 #   evalFunHp     hyper-parameters, read as [1,1]=intercept, [1,2]=reg,
 #                 [1,3]=epsilon; when evalFunHp[1,1] is NaN they are tuned
 #                 with gridSearch (cv=TRUE, cvk=3, predict="accuracyMSVM")
 # OUTPUT:
 #   output        cbind(accuracy, evalFunHp): accuracy in percent followed by
 #                 the hyper-parameters that were actually used
 #   error         per-test-row indicator of yhat != Ytest; all ones when Y
 #                 holds a single class and no model is trained
 evalClassificationMSVM = function(Matrix[Double] X, Matrix[Double] Y, Matrix[Double] Xtest, Matrix[Double] Ytest, Matrix[Double] Xorig=as.matrix(0),
   Matrix[Double] evalFunHp)
 return(Matrix[Double] output, Matrix[Double] error)
 {
   if(is.na(as.scalar(evalFunHp[1,1])))
   {
     nc = max(Y);
     params = list("intercept", "reg", "epsilon")
     paramRanges = list(seq(0, 1), 10^seq(1,-3), 10^seq(1,-5));
     trainArgs = list(X=X, Y=Y, intercept=-1, reg=-1, epsilon=-1, maxIterations=1000,  verbose=FALSE);
     dataArgs = list("X", "Y");
     [B1,opt] = gridSearch(X=X, y=Y, train="msvm", predict="accuracyMSVM", numB=(ncol(X)+1)*(nc),
       params=params, paramValues=paramRanges, dataArgs=dataArgs, trainArgs=trainArgs, cv=TRUE, cvk=3, verbose=FALSE);
     evalFunHp = as.matrix(opt) # opt #
     # fixed alternative to tuning:
     # opt = matrix("2 10 0.001", rows=1, cols=3)
     # evalFunHp = opt
   }
   if(min(Y) == max(Y))
   {
     # Only one class in the training labels: no model is trained and the
     # accuracy is reported as 0. Flag every test row as an error so that the
     # declared return value `error` is bound on this path as well.
     accuracy = as.matrix(0)
     error = matrix(1, rows=nrow(Ytest), cols=1)
   }
   else {
     beta = msvm(X=X, Y=Y, intercept=as.scalar(evalFunHp[1,1]), reg=as.scalar(evalFunHp[1,2]), epsilon=as.scalar(evalFunHp[1,3]),
       maxIterations=1000, verbose=FALSE);
     yhat = msvmPredict(X=Xtest, W=beta);
     # turn the per-class output into a predicted class index per row
     yhat = rowIndexMax(yhat)
     accuracy = mean(yhat == Ytest) * 100
     error = yhat != Ytest
     accuracy = as.matrix(accuracy)
   }
   output = cbind(accuracy, evalFunHp)
 }
 # Scoring function named by predict="accuracyMSVM" in the gridSearch call of
 # evalClassificationMSVM.
 #
 # INPUT:  X features, y labels, B model passed to msvmPredict as W
 # OUTPUT: err = 1 - mean(predicted == y) as a 1x1 matrix, where predicted is
 #         the row-wise index of the maximum of the msvmPredict output
 accuracyMSVM = function(Matrix[Double] X, Matrix[Double] y, Matrix[Double] B)
   return (Matrix[Double] err)
 {
   scores = msvmPredict(X=X, W=B)
   predicted = rowIndexMax(scores)
   hitRate = mean(predicted == y)
   err = as.matrix(1 - hitRate)
 }
	#-------------------------------------------------------------
	#
	# Licensed to the Apache Software Foundation (ASF) under one
	# or more contributor license agreements. See the NOTICE file
	# distributed with this work for additional information
	# regarding copyright ownership. The ASF licenses this file
	# to you under the Apache License, Version 2.0 (the
	# "License"); you may not use this file except in compliance
	# with the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing,
	# software distributed under the License is distributed on an
	# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	# KIND, either express or implied. See the License for the
	# specific language governing permissions and limitations
	# under the License.
	#
	#-------------------------------------------------------------
	# Generate the logical pipelines for data cleaning

	source("scripts/pipelines/scripts/utils.dml") as utils;

	# Driver: read the command-line inputs, build the train/test frames, run
	# topk_cleaning with the evaluation UDF defined in this file, and write the
	# returned artifacts below the output directory.

	# -- input frames; the naStrings tokens are read as missing values --
	dirtyFrame = read($dirtyData, data_type="frame", format="csv", header=TRUE,
	  naStrings=["NA", "null"," ","NaN", "nan", "", " ", "_nan_", "inf", "?", "NAN", "99999", "99999.00"]);
	metaFrame = read($metaData, data_type="frame", format="csv", header=FALSE);
	primitiveFrame = read($primitives, data_type="frame", format="csv", header=TRUE);
	paramFrame = read($parameters, data_type="frame", format="csv", header=TRUE);

	# -- scalar settings taken from the command line --
	k = $topk
	resourceVal = $rv
	minIncrease = $expectedIncrease
	sampleArg = $sample
	maxIter = $max_iter
	outDir = $output
	useCV = as.logical($testCV)
	numFolds = as.integer($cvk)
	trainFraction = as.double($split)
	evalFuncName = "evalClassification"
	lastTrainRow = nrow(dirtyFrame) * trainFraction

	# -- train/test frames: without CV the leading rows are used for training and
	#    the remaining rows for testing; with CV all rows are used for training
	#    and the test frame is empty --
	if(!useCV) {
	  trainFrame = dirtyFrame[1:lastTrainRow,]
	  testFrame = dirtyFrame[lastTrainRow+1:nrow(dirtyFrame),]
	}
	else {
	  trainFrame = dirtyFrame
	  testFrame = frame("", rows=0, cols=0)
	}

	if(nrow(metaFrame) < 2) {
	  stop("incomplete meta info")
	}

	# keep meta columns 2..n (the first column is dropped)
	metaCols = metaFrame[, 2:ncol(metaFrame)]

	# reference pipeline and "not yet tuned" marker handed to topk_cleaning
	referencePipeline = frame(["imputeByMean", "scale", "dummycoding"], rows=1, cols=3)
	untunedHp = as.matrix(NaN)

	[pipelines, hyperParams, scores, dirtyScore, tunedHp, applyFuncs] = topk_cleaning(
	  dataTrain=trainFrame, dataTest=testFrame, metaData=metaCols,
	  primitives=primitiveFrame, parameters=paramFrame,
	  refSol=referencePipeline,
	  evaluationFunc=evalFuncName, evalFunHp=untunedHp, topK=k,
	  resource_val=resourceVal, enablePruning=TRUE,
	  expectedIncrease=minIncrease, seed=23, max_iter=maxIter,
	  cv=useCV, cvk=numFolds, sample=sampleArg,
	  isLastLabel=TRUE, correctTypos=FALSE)

	# -- persist everything returned by topk_cleaning --
	write(pipelines, outDir+"/pip.csv", format="csv")
	write(hyperParams, outDir+"/hp.csv", format="csv")
	write(scores, outDir+"/bestAcc.csv", format="csv")
	write(dirtyScore, outDir+"/dirtyScore.csv", format="csv")
	write(tunedHp, outDir+"/evalHp.csv", format="csv")
	write(applyFuncs, outDir+"/applyFunc.csv", format="csv")

	# test verdict: TRUE when the first top-k score is above the baseline score
	result = dirtyScore < as.scalar(scores[1, 1])
	write(result, $O)


	# UDF for evaluation; this script passes its name to topk_cleaning through
	# the evaluationFunc argument. Trains multiLogReg on (X, Y) and scores the
	# resulting model on (Xtest, Ytest).
	#
	# INPUT:
	#   X, Y          training features and labels
	#   Xtest, Ytest  test features and labels
	#   Xorig         not read by this function (defaults to as.matrix(0))
	#   evalFunHp     hyper-parameters, read as [1,1]=icpt, [1,2]=reg, [1,3]=tol;
	#                 when evalFunHp[1,1] is NaN the fixed values
	#                 "2 10 0.001" are used instead
	# OUTPUT:
	#   output        cbind(accuracy, evalFunHp): the accuracy followed by the
	#                 hyper-parameters that were actually used
	#   error         per-test-row indicator of yhat != Ytest; all ones when Y
	#                 holds a single class and no model is trained
	evalClassification = function(Matrix[Double] X, Matrix[Double] Y, Matrix[Double] Xtest, Matrix[Double] Ytest, Matrix[Double] Xorig=as.matrix(0),
	  Matrix[Double] evalFunHp)
	return(Matrix[Double] output, Matrix[Double] error)
	{
	  if(is.na(as.scalar(evalFunHp[1,1])))
	  {
	    # Hyper-parameter tuning via gridSearch is disabled here; the disabled
	    # setup is kept for reference and fixed values are used instead.
	    # nc = max(Y);
	    # params = list("icpt", "reg", "tol")
	    # paramRanges = list(seq(0, 2, 1), 10^seq(1,-3), 10^seq(1,-5));
	    # trainArgs = list(X=X, Y=Y, icpt=-1, reg=-1, tol=1e-9, maxi=1000, maxii=-1, verbose=FALSE);
	    # dataArgs = list("X", "Y");
	    # [B1,opt] = gridSearch(X=X, y=Y, train="multiLogReg", predict="accuracy", numB=(ncol(X)+1)*(nc-1),
	    # params=params, paramValues=paramRanges, dataArgs=dataArgs, trainArgs=trainArgs, cv=TRUE, cvk=3, verbose=TRUE);
	    # evalFunHp = as.matrix(opt) # opt #
	    evalFunHp = matrix("2 10 0.001", rows=1, cols=3)
	  }
	  if(min(Y) == max(Y))
	  {
	    # Only one class in the training labels: no model is trained and the
	    # accuracy is reported as 0. Flag every test row as an error so that the
	    # declared return value `error` is bound on this path as well.
	    accuracy = as.matrix(0)
	    error = matrix(1, rows=nrow(Ytest), cols=1)
	  }
	  else {
	    beta = multiLogReg(X=X, Y=Y, icpt=as.scalar(evalFunHp[1,1]), reg=as.scalar(evalFunHp[1,2]), tol=as.scalar(evalFunHp[1,3]),
	      maxi=1000, maxii=0, verbose=FALSE);
	    [prob, yhat, accuracy] = multiLogRegPredict(Xtest, beta, Ytest, FALSE)
	    error = yhat != Ytest
	    accuracy = as.matrix(accuracy)
	  }
	  output = cbind(accuracy, evalFunHp)
	}

	# Scoring function matching the predict="accuracy" argument of the (currently
	# commented-out) gridSearch call in evalClassification.
	#
	# INPUT:  X features, y labels, B model passed to multiLogRegPredict
	# OUTPUT: err = 1 - acc/100 as a 1x1 matrix, where acc is the third output of
	#         multiLogRegPredict (divided by 100, i.e. treated as a percentage)
	accuracy = function(Matrix[Double] X, Matrix[Double] y, Matrix[Double] B)
	  return (Matrix[Double] err)
	{
	  [probs, predicted, accPercent] = multiLogRegPredict(X=X, B=B, Y=y, verbose=FALSE)
	  errFraction = 1 - (accPercent / 100)
	  err = as.matrix(errFraction)
	}


	# Evaluation UDF with the same signature as evalClassification, but based on
	# msvm. Trains msvm on (X, Y) and scores the model on (Xtest, Ytest).
	#
	# INPUT:
	#   X, Y          training features and labels
	#   Xtest, Ytest  test features and labels
	#   Xorig         not read by this function (defaults to as.matrix(0))
	#   evalFunHp     hyper-parameters, read as [1,1]=intercept, [1,2]=reg,
	#                 [1,3]=epsilon; when evalFunHp[1,1] is NaN they are tuned
	#                 with gridSearch (cv=TRUE, cvk=3, predict="accuracyMSVM")
	# OUTPUT:
	#   output        cbind(accuracy, evalFunHp): accuracy in percent followed by
	#                 the hyper-parameters that were actually used
	#   error         per-test-row indicator of yhat != Ytest; all ones when Y
	#                 holds a single class and no model is trained
	evalClassificationMSVM = function(Matrix[Double] X, Matrix[Double] Y, Matrix[Double] Xtest, Matrix[Double] Ytest, Matrix[Double] Xorig=as.matrix(0),
	  Matrix[Double] evalFunHp)
	return(Matrix[Double] output, Matrix[Double] error)
	{
	  if(is.na(as.scalar(evalFunHp[1,1])))
	  {
	    nc = max(Y);
	    params = list("intercept", "reg", "epsilon")
	    paramRanges = list(seq(0, 1), 10^seq(1,-3), 10^seq(1,-5));
	    trainArgs = list(X=X, Y=Y, intercept=-1, reg=-1, epsilon=-1, maxIterations=1000, verbose=FALSE);
	    dataArgs = list("X", "Y");
	    [B1,opt] = gridSearch(X=X, y=Y, train="msvm", predict="accuracyMSVM", numB=(ncol(X)+1)*(nc),
	      params=params, paramValues=paramRanges, dataArgs=dataArgs, trainArgs=trainArgs, cv=TRUE, cvk=3, verbose=FALSE);
	    evalFunHp = as.matrix(opt) # opt #
	    # fixed alternative to tuning:
	    # opt = matrix("2 10 0.001", rows=1, cols=3)
	    # evalFunHp = opt
	  }
	  if(min(Y) == max(Y))
	  {
	    # Only one class in the training labels: no model is trained and the
	    # accuracy is reported as 0. Flag every test row as an error so that the
	    # declared return value `error` is bound on this path as well.
	    accuracy = as.matrix(0)
	    error = matrix(1, rows=nrow(Ytest), cols=1)
	  }
	  else {
	    beta = msvm(X=X, Y=Y, intercept=as.scalar(evalFunHp[1,1]), reg=as.scalar(evalFunHp[1,2]), epsilon=as.scalar(evalFunHp[1,3]),
	      maxIterations=1000, verbose=FALSE);
	    yhat = msvmPredict(X=Xtest, W=beta);
	    # turn the per-class output into a predicted class index per row
	    yhat = rowIndexMax(yhat)
	    accuracy = mean(yhat == Ytest) * 100
	    error = yhat != Ytest
	    accuracy = as.matrix(accuracy)
	  }
	  output = cbind(accuracy, evalFunHp)
	}
	# Scoring function named by predict="accuracyMSVM" in the gridSearch call of
	# evalClassificationMSVM.
	#
	# INPUT:  X features, y labels, B model passed to msvmPredict as W
	# OUTPUT: err = 1 - mean(predicted == y) as a 1x1 matrix, where predicted is
	#         the row-wise index of the maximum of the msvmPredict output
	accuracyMSVM = function(Matrix[Double] X, Matrix[Double] y, Matrix[Double] B)
	  return (Matrix[Double] err)
	{
	  scores = msvmPredict(X=X, W=B)
	  predicted = rowIndexMax(scores)
	  hitRate = mean(predicted == y)
	  err = as.matrix(1 - hitRate)
	}