| #------------------------------------------------------------- |
| # |
| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| # |
| #------------------------------------------------------------- |
| ################################################################################################################## |
# This script reads the dirty and the clean data, applies the best pipeline to the dirty data,
# then trains a classifier on both the cleaned and the original dataset and checks whether the
# cleaned dataset achieves the same classification accuracy as the original one.
| |
| # Vocab = original data -> dataset without any noise, the original version with ground truths |
| # cleaned data -> dirty dataset cleaned by pipeline |
| # read the items |
| # dirty dataset F |
| # clean dataset O |
| # metadata (schema and mask) |
| # best k pipelines and hyperparameters generated by previous script mainScript.dml |
| |
| # do the initial preprocessing like dropping invalid values so that pipeline could fix them |
| # then recode the data to bring it into matrix format |
| # then construct the hyper-parameters list and call the executePipeline() on the dirty dataset |
| # for the comparison OHE the original dataset, there is no need to OHE the cleaned dataset because cleaning pipeline |
| # has a primitive for this |
| # Call the multilogReg on both of the datasets and compare accuracy on k=3 cross validation |
| ###################################################################################################################### |
| |
source("scripts/pipelines/scripts/utils.dml") as utils;


# $1: dirty dataset (CSV with header); the listed naStrings are parsed as missing values.
F = read($1, data_type="frame", format="csv", header=TRUE, 
  naStrings= ["NA", "null"," ","NaN", "nan", "", "?", "99999"]);
# $2: metadata frame (schema and mask rows); first column holds row labels and is dropped below.
metaInfo = read($2, data_type="frame", format="csv", header=FALSE);
# $3: directory containing the artifacts written by the previous topk-cleaning run (mainScript.dml).
input = $3
pip = read(input+"pip.csv", data_type="frame", format="csv", header=FALSE);           # best-k pipelines
applyFunc = read(input+"applyFunc.csv", data_type="frame", format="csv", header=FALSE); # apply-functions per primitive
hp = read(input+"hp.csv", data_type="matrix", format="csv", header=FALSE);             # pipeline hyper-parameters
evalHp = read(input+"evalHp.csv", data_type="matrix", format="csv", header=FALSE);     # evaluation-function hyper-parameters
# dirtyScore = read(input+"dirtyScore.csv", data_type="scalar", value_type="double");
# NOTE(review): $4 appears reserved for the commented-out dirtyScore input above — confirm CLI argument order.
trainTestSplit = as.double($5)
# Drop the label column of the metadata so only per-column schema/mask values remain.
metaInfo = metaInfo[, 2:ncol(metaInfo)]

# Row-wise train/test split at the given fraction.
# NOTE(review): split may be fractional if nrow(F) * trainTestSplit is not an integer —
# presumably inputs are sized to avoid this; consider floor() otherwise.
split = nrow(F) * trainTestSplit
trainData = F[1:split,]
testData = F[split+1:nrow(F),]


print("pipeline: "+toString(pip[1]))
# Fit the best (first) pipeline on the train data; returns accuracies (dirty/train/test),
# the transformed train/test matrices, and the encoder/imputation state for reuse.
[result, trX, tsX, exState, iState] = fit_pipeline(trainData, testData, metaInfo, pip[1,], applyFunc[1,], hp[1,], 3, "evalClassification", evalHp, TRUE, FALSE)
# Re-apply the fitted pipeline state to the raw test data; should reproduce tsX.
eXtest = apply_pipeline(testData, metaInfo, pip[1,], applyFunc[1,], hp[1,], TRUE, exState, iState, FALSE)


result = as.frame(result)
# Success criterion 1: test accuracy after cleaning exceeds the dirty-data accuracy.
resultBool = as.scalar(result[1, 3] > result[1, 1])
eXtest = replace(target=eXtest, pattern=NaN, replacement=0)
tsX = replace(target=tsX, pattern=NaN, replacement=0)


# NOTE(review): rows 51:111 are a hard-coded spot-check window — confirm it fits the test dataset size.
resApply = sum(eXtest[51:111] - tsX[51:111, 1:ncol(eXtest)]) == 0
# Success criterion 2: fewer than 5% of cells differ between re-applied and fitted test matrices.
percent = sum(eXtest-tsX[, 1:ncol(eXtest)] > 0) / (nrow(eXtest) * ncol(eXtest))
errorMargin = percent < 0.05
resultBool = resultBool & errorMargin

# $6: output path for the overall boolean verdict.
write(resultBool, $6)

header = frame(["dirty acc", "train acc", "test acc"], rows=1, cols=3)
writeRes = rbind(header, result)
print(toString(writeRes))
| |
| # UDF for evaluation |
# Parameters provided by the API: X/Y (train), Xtest/Ytest, Xorig (clone of X), and evalFunHp
# (evaluation hyper-parameters — tuned internally via grid search when not supplied externally).
# Evaluation UDF for the cleaning pipeline: trains multinomial logistic regression on (X, Y),
# scores it on (Xtest, Ytest), and returns the accuracy cbind-ed with the hyper-parameters used,
# plus the per-row misclassification indicator.
# If evalFunHp[1,1] is NA, the hyper-parameters (icpt, reg, tol, maxi) are tuned via gridSearch.
evalClassification = function(Matrix[Double] X, Matrix[Double] Y, Matrix[Double] Xtest, Matrix[Double] Ytest, Matrix[Double] Xorig=as.matrix(0),
  Matrix[Double] evalFunHp)
  return(Matrix[Double] output, Matrix[Double] error)
{
  if(is.na(as.scalar(evalFunHp[1,1])))
  {
    # No hyper-parameters supplied: tune them with 3-fold cross-validated grid search.
    nc = max(Y);
    params = list("icpt", "reg", "tol", "maxi")
    paramRanges = list(seq(0, 2, 1), 10^seq(1,-3), 10^seq(1,-5), 10^seq(1,3));
    trainArgs = list(X=X, y=Y, icpt=-1, reg=-1, tol=1e-9, maxi=100, maxii=-1, verbose=FALSE);
    # FIX: the fold count must be passed as 'cvk'; the original passed the named
    # argument 'cv' twice (cv=TRUE, cv=3), so the fold count was never set.
    [B1,opt] = gridSearch(X=X, y=Y, train="multiLogReg", predict="accuracy", numB=(ncol(X)+1)*(nc-1),
      params=params, paramValues=paramRanges, trainArgs=trainArgs, cv=TRUE, cvk=3, verbose=FALSE);
    evalFunHp = as.matrix(opt)
  }
  if(min(Y) == max(Y))
  {
    # Degenerate label column (single class): no model can be trained; report zero accuracy.
    accuracy = as.matrix(0)
    # FIX: 'error' was left unassigned on this branch, which fails at function return.
    error = as.matrix(0)
  }
  else {
    beta = multiLogReg(X=X, Y=Y, icpt=as.scalar(evalFunHp[1,1]), reg=as.scalar(evalFunHp[1,2]), tol=as.scalar(evalFunHp[1,3]), 
      maxi=1000, maxii=0, verbose=FALSE);
    [prob, yhat, accuracy] = multiLogRegPredict(Xtest, beta, Ytest, FALSE)
    # Per-row misclassification indicator on the test set.
    error = yhat != Ytest
    accuracy = as.matrix(accuracy)
    print("accuracy: "+toString(accuracy))
  }
  output = cbind(accuracy, evalFunHp)
}
| |
# Scoring helper used as the 'predict' function by gridSearch: evaluates model B on
# features X with labels y and returns the classification error, i.e. 1 - accuracy/100.
accuracy = function(Matrix[Double] X, Matrix[Double] y, Matrix[Double] B) return (Matrix[Double] err) {
  [probs, preds, accPct] = multiLogRegPredict(X=X, B=B, Y=y, verbose=FALSE);
  err = as.matrix(1 - (accPct / 100));
}