blob: 3c6e70cd7bb255dee97241efcab4e09088063c3e [file] [log] [blame]
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
# Generate the logical pipelines for data cleaning.
#
# Command-line arguments read by this script:
#   $dirtyData         CSV frame (with header); its last column is used as the class label
#   $metaData          CSV frame (no header); row 1 = schema, row 2 = recode mask,
#                      row 3 = FD mask (the first column of each row is skipped)
#   $primitives        CSV frame (with header), forwarded to lg::enumerateLogical
#   $parameters        CSV frame (with header), forwarded to lg::enumerateLogical
#   $dirtyScore        baseline score; passed on as dirtyScore + $expectedIncrease
#   $expectedIncrease  increment added to $dirtyScore
#   $max_iter          forwarded to lg::enumerateLogical as max_iter
#   $O                 output location for the "converged" result
source("scripts/pipelines/scripts/utils.dml") as utils;
source("scripts/pipelines/scripts/enumerateLogical.dml") as lg;

# read the inputs
X = read($dirtyData, data_type="frame", format="csv", header=TRUE,
  naStrings= ["NA", "null"," ","NaN", "nan", "", "?", "99999"]);
metaInfo = read($metaData, data_type="frame", format="csv", header=FALSE);
primitives = read($primitives, data_type = "frame", format="csv", header= TRUE)
param = read($parameters, data_type = "frame", format="csv", header= TRUE)
dirtyScore = $dirtyScore
max_iter = $max_iter
expectedIncrease = $expectedIncrease
# fraction of rows used for training in the holdout split below
trainTestSplit = 0.7

getSchema = metaInfo[1, 2:ncol(metaInfo)]
getMask = as.matrix(metaInfo[2, 2:ncol(metaInfo)])
# columns of interest for FD computation
# NOTE(review): getFdMask is not consumed below (metaList is built with fd=as.matrix(0)) - confirm intent
getFdMask = as.matrix(metaInfo[3, 2:ncol(metaInfo)])

# encode the categorical data
if(sum(getMask) > 0)
{
  # recode every column flagged in the mask
  index = vectorToCsv(getMask)
  jspecR = "{ids:true, recode:["+index+"]}"
  [eX, X_meta] = transformencode(target=X, spec=jspecR);
  # change the schema to reflect the encoded values
  getSchema = map(getSchema, "x->x.replace(\"STRING\", \"INT64\")")
  getSchema = map(getSchema, "x->x.replace(\"BOOLEAN\", \"INT64\")")
}
# if no categorical value exist then just cast the frame into matrix
else
{
  eX = as.matrix(X)
}

# extract the class label (last column) and drop it from the features
eY = eX[, ncol(eX)]
eX = eX[, 1:ncol(eX) - 1]
getMask = getMask[, 1:ncol(getMask) - 1] # strip the class label from the mask
getFdMask = getFdMask[, 1:ncol(getFdMask) - 1] # strip the class label from the FD mask
getSchema = getSchema[, 1:ncol(getSchema) - 1] # strip the class label from the schema

metaList = list(mask=getMask, schema=getSchema, fd=as.matrix(0), applyFunc=as.frame("NULL"), distY = 20)

# initial population of logical operators, one per row
logical = frame([
  "MVI",
  "ED",
  "OTLR",
  "EC"
  ], rows=4, cols=1)
# NOTE(review): categories is not referenced anywhere in the visible script - confirm it is still needed
categories = frame(["ED", "MVI", "OTLR", "EC"], rows=1, cols=4)

# when recoded columns exist, append a DUMMY step to every logical pipeline
if(sum(getMask) > 0)
{
  dummyEncode = frame("DUMMY", rows=nrow(logical), cols=1)
  logical = cbind(logical, dummyEncode)
}

# doing holdout evaluation: first trainTestSplit fraction of rows for training, rest for testing
split = nrow(eX) * trainTestSplit
trainX = eX[1:split,]
trainY = eY[1:split,]
testX = eX[split+1:nrow(eX),]
testY = eY[split+1:nrow(eX),]

[bestLogical, bestHp, converged] = lg::enumerateLogical(X=trainX, y=trainY, Xtest=testX, ytest=testY,
  initial_population=logical, seed = 42, max_iter=max_iter, metaList = metaList, evaluationFunc="evalML",
  dirtyScore = dirtyScore + expectedIncrease, evalFunHp=matrix("1 1e-3 1e-9 100", rows=1, cols=4), primitives=primitives,
  param=param, cv=FALSE, verbose=TRUE)

print("bestLogical "+toString(bestLogical))
# result = dirtyScore < score
print("result satisfied ------------"+converged)
write(converged , $O)
# UDF for evaluation, referenced by name ("evalML") when calling lg::enumerateLogical.
# Fits a multiLogReg model on (X, Y) and scores it on (Xtest, Ytest).
#   X, Y          training features and labels
#   Xtest, Ytest  held-out features and labels used for scoring
#   Xorig         optional, not used inside this function
#   evalFunHp     1x4 row of hyper-parameters, read as [icpt, reg, tol, maxi]
# Returns the third output of multiLogRegPredict wrapped as a 1x1 matrix.
evalML = function(Matrix[Double] X, Matrix[Double] Y, Matrix[Double] Xtest, Matrix[Double] Ytest, Matrix[Double] Xorig=as.matrix(0),
  Matrix[Double] evalFunHp)
return(Matrix[Double] accuracy)
{
  # unpack the hyper-parameter row into named scalars
  hpIcpt = as.scalar(evalFunHp[1,1])
  hpReg = as.scalar(evalFunHp[1,2])
  hpTol = as.scalar(evalFunHp[1,3])
  hpMaxOuter = as.scalar(evalFunHp[1,4])

  # train; the inner iteration limit is fixed at 50
  model = multiLogReg(X=X, Y=Y, icpt=hpIcpt, reg=hpReg, tol=hpTol,
    maxi=hpMaxOuter, maxii=50, verbose=FALSE);

  # score on the held-out data; only the third output is used
  [probs, predicted, score] = multiLogRegPredict(Xtest, model, Ytest, FALSE)
  print("accuracy weighted: "+score)
  accuracy = as.matrix(score)
}
# Error of a multiLogReg model B on data (X, y).
# Calls multiLogRegPredict (verbose) and converts its third output acc into
# 1 - acc/100, returned as a 1x1 matrix.
# NOTE(review): the division by 100 presumes acc is reported in percent - confirm against multiLogRegPredict
accuracy = function(Matrix[Double] X, Matrix[Double] y, Matrix[Double] B) return (Matrix[Double] err) {
  [probs, predicted, pctCorrect] = multiLogRegPredict(X=X, B=B, Y=y, verbose=TRUE);
  errorRate = 1 - (pctCorrect / 100);
  err = as.matrix(errorRate);
}