blob: 7b816d9a4ef1019fce48b20077cd0ed725ba9df7 [file] [log] [blame]
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
# Driver script: generate the logical pipelines for data cleaning and check
# that the best pipeline found scores higher than the dirty data.
source("scripts/pipelines/scripts/utils.dml") as utils;

# ---- read the inputs -------------------------------------------------
# dirty dataset; every token listed in naStrings is read as a missing value
F = read($dirtyData, data_type="frame", format="csv", header=TRUE,
  naStrings= ["NA", "null"," ","NaN", "nan", "", "?", "99999"]);
metaInfo = read($metaData, data_type="frame", format="csv", header=FALSE);
primitives = read($primitives, data_type = "frame", format="csv", header= TRUE)
param = read($parameters, data_type = "frame", format="csv", header= TRUE)

# NOTE(review): topK and resources are read from the command line but are not
# used below (the call passes the literals 5 and 20) - confirm this is intended.
topK = $topk
resources = $rv
sample = $sample

# ---- validate and trim the meta data ---------------------------------
if (nrow(metaInfo) < 2) {
  stop("incomplete meta info")
}
# keep columns 2 .. ncol-1, i.e. drop the first and the last meta column
lastMetaCol = ncol(metaInfo) - 1
metaInfo = metaInfo[, 2:lastMetaCol]

# ---- run the cleaning search -----------------------------------------
[topKPipelines, topKHyperParams, topKScores, features, dirtyScore] = handsOffCleaning(
  F, metaInfo, primitives, param,
  matrix("4 0.7 1", rows=1, cols=3),
  "evalClassification", as.matrix("0"),
  5, 20, sample, TRUE)

print("this is accuracies "+toString(topKScores))

# ---- compare the top score against the dirty baseline and persist ----
bestScore = as.scalar(topKScores[1, 1])
result = dirtyScore < bestScore
write(result, $O)
# UDF for evaluation
# Scores a dataset with cross-validated multinomial logistic regression.
# Inputs:
#   X, Y       feature matrix and label vector
#   Xorig      clone of X supplied by the API (not used in this function)
#   metaList   list that must contain a 'mask' entry; the mask is handed to
#              utils::dummycoding (presumably flags categorical columns - confirm)
#   evalFunHp  hyper-parameters of the model; replaced by the grid-search optimum
#              when trainML == 1, otherwise used as passed in
#   trainML    1: tune the hyper-parameters internally via gridSearch,
#              any other value: use evalFunHp supplied by the caller
# Output: cbind(score, evalFunHp), where score is the mean fold accuracy
#         (0 when Y contains a single class)
evalClassification = function(Matrix[Double] X, Matrix[Double] Y, Matrix[Double] Xorig, List[Unknown] metaList,
  Matrix[Double] evalFunHp, Integer trainML=0)
return(Matrix[Double] output)
{
  cv = 2
  mask = as.matrix(metaList['mask'])
  if(max(Y) == min(Y)) {
    print("Y contains only one class")
    # assign `score` (not an unrelated local) so the cbind below has a value
    # on this path as well
    score = as.double(0)
  }
  else {
    if(trainML == 1)
    {
      # do the gridsearch for hyper-parameters
      params = list("icpt", "reg", "tol", "maxii")
      paramRanges = list(seq(0, 2, 1), 10^seq(1,-4), 10^seq(1,-6), 10^seq(1,3));
      # NOTE(review): dummy coding is applied only on this tuning path; with
      # trainML != 1 the cross validation below runs on X as passed in.
      if(sum(mask) > 0)
        X = utils::dummycoding(replace(target = X, pattern = NaN, replacement=0), mask)
      # -1 marks the arguments that gridSearch fills in from paramRanges
      trainArgs = list(X=X, Y=Y, icpt=-1, reg=-1, tol=-1, maxi=100, maxii=-1, verbose=FALSE);
      [B1, opt] = gridSearch(X=X, y=Y, train="multiLogReg", predict="W", numB=ncol(X)+1, cv=TRUE, cvk=cv,
        params=params, paramValues=paramRanges, trainArgs=trainArgs, verbose=TRUE);
      evalFunHp = as.matrix(opt)
    }
    # run the cv-fold cross validation (cv = 2) and average the fold accuracies
    [accuracyMatrix] = crossV(X, Y, cv, evalFunHp, FALSE)
    accuracyMatrix = removeEmpty(target=accuracyMatrix, margin="rows")
    score = mean(accuracyMatrix)
    print(cv +" validation accuracy "+score)
  }
  output = cbind(as.matrix(score), evalFunHp)
}
# ######################################################################
# Stratified k-fold cross validation with multinomial logistic regression.
# Inputs:
#   X, y        feature matrix and label vector
#   k           number of folds
#   MLhp        1-row matrix of hyper-parameters identified via gridsearch,
#               read as [icpt, reg, tol, maxii]
#   isWeighted  forwarded to getAccuracy ((un)weighted accuracy)
# Output: k x 1 matrix holding the accuracy of each held-out fold.
# The labels are used as row positions by table(), so they are assumed to be
# consecutive positive integers 1..C - TODO confirm against callers.
# ######################################################################
crossV = function(Matrix[double] X, Matrix[double] y, Integer k, Matrix[Double] MLhp, Boolean isWeighted)
return (Matrix[Double] accuracyMatrix)
{
  accuracyMatrix = matrix(0, k, 1)
  print("ML HP in CV "+toString(MLhp))
  dataList = list()

  # sort the rows by label so that each class occupies one contiguous range
  data = order(target = cbind(y, X), by = 1, decreasing=FALSE, index.return=FALSE)
  # classes[j, 1] = number of instances carrying label j
  classes = table(data[, 1], 1)
  ins_per_fold = classes/k
  # per class: [first, last] position (within the class) of the current fold
  start_fold = matrix(1, rows=nrow(ins_per_fold), cols=1)
  fold_idxes = cbind(start_fold, ins_per_fold)

  # build the k folds; every fold receives one slice of every class
  for(i in 1:k)
  {
    fold_i = matrix(0, 0, ncol(data))
    start = 0; end = 0;
    for(j in 1:nrow(classes))
    {
      # rows start:end of the sorted data belong to class j
      idx = as.scalar(classes[j, 1])
      start = end + 1;
      end = end + idx
      class_j = data[start:end, ]
      # slice of class j assigned to fold i
      start_i = as.scalar(fold_idxes[j, 1]);
      end_i = as.scalar(fold_idxes[j, 2])
      fold_i = rbind(fold_i, class_j[start_i:end_i, ])
    }
    dataList = append(dataList, fold_i)
    # advance every class window to the next fold
    fold_idxes[, 1] = fold_idxes[, 2] + 1
    fold_idxes[, 2] += ins_per_fold
  }

  # hold out each fold once, train on the remaining folds, score the hold-out
  for(i in seq(1,k))
  {
    [trainList, hold_out] = remove(dataList, i)
    trainset = rbind(trainList)
    testset = as.matrix(hold_out)
    trainX = trainset[, 2:ncol(trainset)]
    trainy = trainset[, 1]
    testX = testset[, 2:ncol(testset)]
    testy = testset[, 1]
    # The grid search in evalClassification tunes (icpt, reg, tol, maxii) with
    # maxi fixed at 100, so the 4th hyper-parameter is the inner-iteration
    # limit; pass it as maxii and keep maxi at the value used during tuning.
    beta = multiLogReg(X=trainX, Y=trainy, icpt=as.scalar(MLhp[1,1]), reg=as.scalar(MLhp[1,2]), tol=as.scalar(MLhp[1,3]),
      maxi=100, maxii=as.scalar(MLhp[1,4]), verbose=FALSE);
    [prob, yhat, a] = multiLogRegPredict(testX, beta, testy, FALSE)
    accuracy = getAccuracy(testy, yhat, isWeighted)
    accuracyMatrix[i] = accuracy
  }
}