# src/test/scripts/functions/pipelines/topkLogicalTest.dml - systemds - Git at Google

 #-------------------------------------------------------------
 #
 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
 #
 #-------------------------------------------------------------
 # Generate the logical pipelines for data cleaning

 source("scripts/pipelines/scripts/utils.dml") as utils;
 source("scripts/pipelines/scripts/enumerateLogical.dml") as lg;


 # read the inputs; the tokens listed in naStrings are handed to read() as NA markers
 X = read($dirtyData, data_type="frame", format="csv", header=TRUE,
   naStrings= ["NA", "null","  ","NaN", "nan", "", "?", "99999"]);

 # metaInfo rows used below (first column skipped): 1 = schema, 2 = recode mask, 3 = FD mask
 metaInfo = read($metaData, data_type="frame", format="csv", header=FALSE);
 primitives = read($primitives, data_type = "frame", format="csv", header= TRUE)
 param = read($parameters, data_type = "frame", format="csv", header= TRUE)

 # scalar script arguments
 max_iter = $max_iter
 dirtyScore = $dirtyScore
 expectedIncrease=$expectedIncrease
 # fraction of rows that goes into the training partition of the holdout split
 trainTestSplit = 0.7
 getSchema = metaInfo[1, 2:ncol(metaInfo)]
 getMask = as.matrix(metaInfo[2, 2:ncol(metaInfo)])
 # columns of interest for FD computation
 # NOTE(review): getFdMask is computed and stripped below but metaList passes fd=as.matrix(0) - confirm intent
 getFdMask = as.matrix(metaInfo[3, 2:ncol(metaInfo)])

 # encode the categorical data
 if(sum(getMask) > 0)
 {
   # always recode the label
   index = vectorToCsv(getMask)
   jspecR = "{ids:true, recode:["+index+"]}"
   [eX, X_meta] = transformencode(target=X, spec=jspecR);
   # change the schema to reflect the encoded values
   getSchema = map(getSchema, "x->x.replace(\"STRING\", \"INT64\")")
   getSchema = map(getSchema, "x->x.replace(\"BOOLEAN\", \"INT64\")")
 }
 # if no categorical value exist then just cast the frame into matrix
 else
   eX = as.matrix(X)

 # extract the class label (last column) and keep the remaining columns as features
 eY = eX[, ncol(eX)]
 eX = eX[, 1:ncol(eX) - 1]

 # drop the class-label column from every piece of per-column metadata
 getMask = getMask[, 1:ncol(getMask) - 1] # strip the mask of class label
 getFdMask = getFdMask[, 1:ncol(getFdMask) - 1] # strip the FD mask of class label
 getSchema = getSchema[, 1:ncol(getSchema) - 1] # strip the schema of class label

 metaList = list(mask=getMask, schema=getSchema, fd=as.matrix(0), applyFunc=as.frame("NULL"), distY = 20)

 # initial population of logical operators, one per row
 logical =  frame([
                  "MVI",
                  "ED",
                  "OTLR",
                  "EC"
                  ], rows=4, cols=1)

 # when recoded columns exist, append a DUMMY step as a second column to every row
 if(sum(getMask) > 0)
 {
   dummyEncode = frame("DUMMY", rows=nrow(logical), cols=1)
   logical = cbind(logical, dummyEncode)
 }

 # doing holdout evaluation: leading rows are used for training, the rest for testing
 # NOTE(review): split can be fractional; how the index bounds resolve it is not visible here - confirm
 split = nrow(eX) * trainTestSplit

 trainX = eX[1:split,]
 trainY = eY[1:split,]
 testX = eX[split+1:nrow(eX),]
 testY = eY[split+1:nrow(eX),]


 # enumerate logical pipelines; the target score is the dirty baseline plus the expected increase
 [bestLogical, bestHp, converged] = lg::enumerateLogical(X=trainX, y=trainY, Xtest=testX, ytest=testY,
   initial_population=logical, seed = 42, max_iter=max_iter, metaList = metaList, evaluationFunc="evalML",
   dirtyScore = dirtyScore + expectedIncrease, evalFunHp=matrix("1 1e-3 1e-9 100", rows=1, cols=4), primitives=primitives,
   param=param, cv=FALSE, verbose=TRUE)


 print("bestLogical "+toString(bestLogical))
 # result = dirtyScore < score
 print("result satisfied ------------"+converged)

 # persist the convergence flag to the output location given by $O
 write(converged , $O)


 # UDF for evaluation, referenced by name ("evalML") in the enumerateLogical call.
 # Trains multiLogReg on (X, Y) and scores the model on (Xtest, Ytest).
 #   X, Y          training features and labels
 #   Xtest, Ytest  held-out features and labels
 #   Xorig         not used in this body
 #   evalFunHp     1-row matrix of hyper-parameters read as [icpt, reg, tol, maxi]
 # Returns the third output of multiLogRegPredict wrapped as a 1x1 matrix.
 evalML = function(Matrix[Double] X, Matrix[Double] Y, Matrix[Double] Xtest, Matrix[Double] Ytest, Matrix[Double] Xorig=as.matrix(0),
   Matrix[Double] evalFunHp)
 return(Matrix[Double] accuracy)
 {
   # unpack the hyper-parameter row into named scalars
   intercept = as.scalar(evalFunHp[1,1])
   regWeight = as.scalar(evalFunHp[1,2])
   tolerance = as.scalar(evalFunHp[1,3])
   outerIter = as.scalar(evalFunHp[1,4])

   # fit the model on the training partition
   model = multiLogReg(X=X, Y=Y, icpt=intercept, reg=regWeight, tol=tolerance,
     maxi=outerIter, maxii=50, verbose=FALSE);

   # score on the held-out partition; only the third output is used
   [probs, predicted, score] = multiLogRegPredict(Xtest, model, Ytest, FALSE)
   # alternative kept for reference: accuracy = getAccuracy(Ytest, predicted, FALSE)
   print("accuracy weighted: "+score)
   accuracy = as.matrix(score)
 }

 # Scores model B on (X, y) with multiLogRegPredict and returns 1 - acc/100 as a 1x1 matrix,
 # i.e. the accuracy output is treated as a percentage and turned into an error fraction.
 accuracy = function(Matrix[Double] X, Matrix[Double] y, Matrix[Double] B) return (Matrix[Double] err) {
   # only the third output (accuracy) is used; the first two are ignored
   [probs, predicted, accPct] = multiLogRegPredict(X=X, B=B, Y=y, verbose=TRUE);
   errFraction = 1-(accPct/100);
   err = as.matrix(errFraction);
 }
	#-------------------------------------------------------------
	#
	# Licensed to the Apache Software Foundation (ASF) under one
	# or more contributor license agreements. See the NOTICE file
	# distributed with this work for additional information
	# regarding copyright ownership. The ASF licenses this file
	# to you under the Apache License, Version 2.0 (the
	# "License"); you may not use this file except in compliance
	# with the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing,
	# software distributed under the License is distributed on an
	# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	# KIND, either express or implied. See the License for the
	# specific language governing permissions and limitations
	# under the License.
	#
	#-------------------------------------------------------------
	# Generate the logical pipelines for data cleaning

	source("scripts/pipelines/scripts/utils.dml") as utils;
	source("scripts/pipelines/scripts/enumerateLogical.dml") as lg;


	# read the inputs; the tokens listed in naStrings are handed to read() as NA markers
	X = read($dirtyData, data_type="frame", format="csv", header=TRUE,
	  naStrings= ["NA", "null"," ","NaN", "nan", "", "?", "99999"]);

	# metaInfo rows used below (first column skipped): 1 = schema, 2 = recode mask, 3 = FD mask
	metaInfo = read($metaData, data_type="frame", format="csv", header=FALSE);
	primitives = read($primitives, data_type = "frame", format="csv", header= TRUE)
	param = read($parameters, data_type = "frame", format="csv", header= TRUE)

	# scalar script arguments
	max_iter = $max_iter
	dirtyScore = $dirtyScore
	expectedIncrease=$expectedIncrease
	# fraction of rows that goes into the training partition of the holdout split
	trainTestSplit = 0.7
	getSchema = metaInfo[1, 2:ncol(metaInfo)]
	getMask = as.matrix(metaInfo[2, 2:ncol(metaInfo)])
	# columns of interest for FD computation
	# NOTE(review): getFdMask is computed and stripped below but metaList passes fd=as.matrix(0) - confirm intent
	getFdMask = as.matrix(metaInfo[3, 2:ncol(metaInfo)])

	# encode the categorical data
	if(sum(getMask) > 0)
	{
	  # always recode the label
	  index = vectorToCsv(getMask)
	  jspecR = "{ids:true, recode:["+index+"]}"
	  [eX, X_meta] = transformencode(target=X, spec=jspecR);
	  # change the schema to reflect the encoded values
	  getSchema = map(getSchema, "x->x.replace(\"STRING\", \"INT64\")")
	  getSchema = map(getSchema, "x->x.replace(\"BOOLEAN\", \"INT64\")")
	}
	# if no categorical value exist then just cast the frame into matrix
	else
	  eX = as.matrix(X)

	# extract the class label (last column) and keep the remaining columns as features
	eY = eX[, ncol(eX)]
	eX = eX[, 1:ncol(eX) - 1]

	# drop the class-label column from every piece of per-column metadata
	getMask = getMask[, 1:ncol(getMask) - 1] # strip the mask of class label
	getFdMask = getFdMask[, 1:ncol(getFdMask) - 1] # strip the FD mask of class label
	getSchema = getSchema[, 1:ncol(getSchema) - 1] # strip the schema of class label

	metaList = list(mask=getMask, schema=getSchema, fd=as.matrix(0), applyFunc=as.frame("NULL"), distY = 20)

	# initial population of logical operators, one per row
	logical = frame([
	  "MVI",
	  "ED",
	  "OTLR",
	  "EC"
	  ], rows=4, cols=1)

	# when recoded columns exist, append a DUMMY step as a second column to every row
	if(sum(getMask) > 0)
	{
	  dummyEncode = frame("DUMMY", rows=nrow(logical), cols=1)
	  logical = cbind(logical, dummyEncode)
	}

	# doing holdout evaluation: leading rows are used for training, the rest for testing
	# NOTE(review): split can be fractional; how the index bounds resolve it is not visible here - confirm
	split = nrow(eX) * trainTestSplit

	trainX = eX[1:split,]
	trainY = eY[1:split,]
	testX = eX[split+1:nrow(eX),]
	testY = eY[split+1:nrow(eX),]


	# enumerate logical pipelines; the target score is the dirty baseline plus the expected increase
	[bestLogical, bestHp, converged] = lg::enumerateLogical(X=trainX, y=trainY, Xtest=testX, ytest=testY,
	  initial_population=logical, seed = 42, max_iter=max_iter, metaList = metaList, evaluationFunc="evalML",
	  dirtyScore = dirtyScore + expectedIncrease, evalFunHp=matrix("1 1e-3 1e-9 100", rows=1, cols=4), primitives=primitives,
	  param=param, cv=FALSE, verbose=TRUE)


	print("bestLogical "+toString(bestLogical))
	# result = dirtyScore < score
	print("result satisfied ------------"+converged)

	# persist the convergence flag to the output location given by $O
	write(converged , $O)


	# UDF for evaluation, referenced by name ("evalML") in the enumerateLogical call.
	# Trains multiLogReg on (X, Y) and scores the model on (Xtest, Ytest).
	#   X, Y          training features and labels
	#   Xtest, Ytest  held-out features and labels
	#   Xorig         not used in this body
	#   evalFunHp     1-row matrix of hyper-parameters read as [icpt, reg, tol, maxi]
	# Returns the third output of multiLogRegPredict wrapped as a 1x1 matrix.
	evalML = function(Matrix[Double] X, Matrix[Double] Y, Matrix[Double] Xtest, Matrix[Double] Ytest, Matrix[Double] Xorig=as.matrix(0),
	  Matrix[Double] evalFunHp)
	return(Matrix[Double] accuracy)
	{
	  # unpack the hyper-parameter row into named scalars
	  intercept = as.scalar(evalFunHp[1,1])
	  regWeight = as.scalar(evalFunHp[1,2])
	  tolerance = as.scalar(evalFunHp[1,3])
	  outerIter = as.scalar(evalFunHp[1,4])

	  # fit the model on the training partition
	  model = multiLogReg(X=X, Y=Y, icpt=intercept, reg=regWeight, tol=tolerance,
	    maxi=outerIter, maxii=50, verbose=FALSE);

	  # score on the held-out partition; only the third output is used
	  [probs, predicted, score] = multiLogRegPredict(Xtest, model, Ytest, FALSE)
	  # alternative kept for reference: accuracy = getAccuracy(Ytest, predicted, FALSE)
	  print("accuracy weighted: "+score)
	  accuracy = as.matrix(score)
	}

	# Scores model B on (X, y) with multiLogRegPredict and returns 1 - acc/100 as a 1x1 matrix,
	# i.e. the accuracy output is treated as a percentage and turned into an error fraction.
	accuracy = function(Matrix[Double] X, Matrix[Double] y, Matrix[Double] B) return (Matrix[Double] err) {
	  # only the third output (accuracy) is used; the first two are ignored
	  [probs, predicted, accPct] = multiLogRegPredict(X=X, B=B, Y=y, verbose=TRUE);
	  errFraction = 1-(accPct/100);
	  err = as.matrix(errFraction);
	}