| #------------------------------------------------------------- |
| # |
| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| # |
| #------------------------------------------------------------- |
| # Generate the logical pipelines for data cleaning |
| |
| source("scripts/pipelines/scripts/utils.dml") as utils; |
| |
| # Load the dirty data set, its meta data, and the primitive/parameter catalogues. |
| F = read($dirtyData, data_type="frame", format="csv", header=TRUE, |
|   naStrings= ["NA", "null"," ","NaN", "nan", "", " ", "_nan_", "inf", "?", "NAN", "99999", "99999.00"]); |
| metaInfo = read($metaData, data_type="frame", format="csv", header=FALSE); |
| primitives = read($primitives, data_type = "frame", format="csv", header= TRUE) |
| param = read($parameters, data_type = "frame", format="csv", header= TRUE) |
| |
| # Scalar settings supplied on the command line. |
| topK = $topk |
| resources = $rv |
| expectedIncrease = $expectedIncrease |
| sample = $sample |
| max_iter = $max_iter |
| outDir = $output |
| testCV = as.logical($testCV) |
| cvk = as.integer($cvk) |
| trainTestSplit = as.double($split) |
| evalFunc = "evalClassification" |
| |
| # Row at which the data is cut into train and test; the ratio comes from $split. |
| splitRow = nrow(F) * trainTestSplit |
| if(!testCV) { |
|   # hold-out evaluation: leading rows train, remaining rows test |
|   trainData = F[1:splitRow,] |
|   testData = F[splitRow+1:nrow(F),] |
| } |
| else { |
|   # cross-validation: all rows are training data and the test frame is empty |
|   trainData = F |
|   testData = frame("", rows=0, cols=0) |
| } |
| |
| if(nrow(metaInfo) < 2) |
|   stop("incomplete meta info") |
| |
| # drop the first column of the meta data before handing it to topk_cleaning |
| metaCols = metaInfo[, 2:ncol(metaInfo)] |
| |
| [topKPipelines, topKHyperParams, topKScores, baseLineScore, evalFunHp, applyFunc] = topk_cleaning( |
|   dataTrain=trainData, dataTest=testData, metaData=metaCols, primitives=primitives, parameters=param, |
|   refSol = frame(["imputeByMean", "scale", "dummycoding"], rows=1, cols=3), |
|   evaluationFunc=evalFunc, evalFunHp=as.matrix(NaN), topK=topK, resource_val=resources, enablePruning=TRUE, |
|   expectedIncrease=expectedIncrease, seed = 23, max_iter=max_iter, cv=testCV, cvk=cvk, sample=sample, |
|   isLastLabel=TRUE, correctTypos=FALSE) |
| |
| # Persist the search results under the output directory. |
| write(topKPipelines, outDir+"/pip.csv", format="csv") |
| write(topKHyperParams, outDir+"/hp.csv", format="csv") |
| write(topKScores, outDir+"/bestAcc.csv", format="csv") |
| write(baseLineScore, outDir+"/dirtyScore.csv", format="csv") |
| write(evalFunHp, outDir+"/evalHp.csv", format="csv") |
| write(applyFunc, outDir+"/applyFunc.csv", format="csv") |
| |
| # TRUE when the first top-k score is strictly greater than the baseline score |
| improved = as.scalar(topKScores[1, 1]) > baseLineScore |
| write(improved, $O) |
| |
| |
| # UDF for evaluation: fits multiLogReg on (X, Y) and scores it on (Xtest, Ytest). |
| # Inputs: |
| #   X, Y          training features and labels |
| #   Xtest, Ytest  evaluation features and labels |
| #   Xorig         not used by this function |
| #   evalFunHp     1x3 hyper-parameters read as (icpt, reg, tol); NaN in cell [1,1] selects the fixed setting below |
| # Outputs: |
| #   output        cbind(accuracy, evalFunHp) |
| #   error         indicator column, yhat != Ytest |
| evalClassification = function(Matrix[Double] X, Matrix[Double] Y, Matrix[Double] Xtest, Matrix[Double] Ytest, Matrix[Double] Xorig=as.matrix(0), |
|   Matrix[Double] evalFunHp) |
| return(Matrix[Double] output, Matrix[Double] error) |
| { |
|   if(is.na(as.scalar(evalFunHp[1,1]))) |
|   { |
|     # Hyper-parameter tuning through gridSearch (train="multiLogReg", predict="accuracy") is disabled; |
|     # the fixed setting icpt=2, reg=10, tol=0.001 is used instead. |
|     evalFunHp = matrix("2 10 0.001", rows=1, cols=3) |
|   } |
|   if(min(Y) == max(Y)) |
|   { |
|     # Only one class in the training labels: no model is trained and the accuracy is reported as 0. |
|     accuracy = as.matrix(0) |
|     # The return variable must be assigned on this path as well. Every test row is marked as an |
|     # error to match the accuracy of 0 -- NOTE(review): confirm this is what topk_cleaning expects. |
|     error = matrix(1, rows=nrow(Ytest), cols=1) |
|   } |
|   else { |
|     beta = multiLogReg(X=X, Y=Y, icpt=as.scalar(evalFunHp[1,1]), reg=as.scalar(evalFunHp[1,2]), tol=as.scalar(evalFunHp[1,3]), |
|       maxi=1000, maxii=0, verbose=FALSE); |
|     [prob, yhat, accuracy] = multiLogRegPredict(Xtest, beta, Ytest, FALSE) |
|     error = yhat != Ytest |
|     accuracy = as.matrix(accuracy) |
|   } |
|   output = cbind(accuracy, evalFunHp) |
| } |
| |
| # Scoring function: predicts with multiLogRegPredict using model B and returns 1 - acc/100 as a 1x1 matrix. |
| accuracy = function(Matrix[Double] X, Matrix[Double] y, Matrix[Double] B) |
|   return (Matrix[Double] err) |
| { |
|   [probs, preds, accPct] = multiLogRegPredict(X=X, B=B, Y=y, verbose=FALSE) |
|   missRate = 1 - (accPct / 100) |
|   err = as.matrix(missRate) |
| } |
| |
| |
| # UDF for evaluation: fits msvm on (X, Y) and scores it on (Xtest, Ytest). |
| # Inputs: |
| #   X, Y          training features and labels |
| #   Xtest, Ytest  evaluation features and labels |
| #   Xorig         not used by this function |
| #   evalFunHp     1x3 hyper-parameters read as (intercept, reg, epsilon); NaN in cell [1,1] triggers a grid search |
| # Outputs: |
| #   output        cbind(accuracy, evalFunHp), accuracy being mean(yhat == Ytest) * 100 |
| #   error         indicator column, yhat != Ytest |
| evalClassificationMSVM = function(Matrix[Double] X, Matrix[Double] Y, Matrix[Double] Xtest, Matrix[Double] Ytest, Matrix[Double] Xorig=as.matrix(0), |
|   Matrix[Double] evalFunHp) |
| return(Matrix[Double] output, Matrix[Double] error) |
| { |
|   if(is.na(as.scalar(evalFunHp[1,1]))) |
|   { |
|     # No hyper-parameters supplied: tune intercept, reg and epsilon with a 3-fold cross-validated grid search. |
|     nc = max(Y); |
|     params = list("intercept", "reg", "epsilon") |
|     paramRanges = list(seq(0, 1), 10^seq(1,-3), 10^seq(1,-5)); |
|     # the -1 entries are placeholders for the parameters listed in params |
|     trainArgs = list(X=X, Y=Y, intercept=-1, reg=-1, epsilon=-1, maxIterations=1000, verbose=FALSE); |
|     dataArgs = list("X", "Y"); |
|     [B1,opt] = gridSearch(X=X, y=Y, train="msvm", predict="accuracyMSVM", numB=(ncol(X)+1)*(nc), |
|       params=params, paramValues=paramRanges, dataArgs=dataArgs, trainArgs=trainArgs, cv=TRUE, cvk=3, verbose=FALSE); |
|     evalFunHp = as.matrix(opt) |
|   } |
|   if(min(Y) == max(Y)) |
|   { |
|     # Only one class in the training labels: no model is trained and the accuracy is reported as 0. |
|     accuracy = as.matrix(0) |
|     # The return variable must be assigned on this path as well. Every test row is marked as an |
|     # error to match the accuracy of 0 -- NOTE(review): confirm this is what topk_cleaning expects. |
|     error = matrix(1, rows=nrow(Ytest), cols=1) |
|   } |
|   else { |
|     beta = msvm(X=X, Y=Y, intercept=as.scalar(evalFunHp[1,1]), reg=as.scalar(evalFunHp[1,2]), epsilon=as.scalar(evalFunHp[1,3]), |
|       maxIterations=1000, verbose=FALSE); |
|     yhat = msvmPredict(X=Xtest, W=beta); |
|     # predicted class = index of the largest score in each row |
|     yhat = rowIndexMax(yhat) |
|     accuracy = mean(yhat == Ytest) * 100 |
|     error = yhat != Ytest |
|     accuracy = as.matrix(accuracy) |
|   } |
|   output = cbind(accuracy, evalFunHp) |
| } |
| # Scoring function: predicts with msvmPredict using weights B and returns 1 - mean(yhat == y) as a 1x1 matrix. |
| accuracyMSVM = function(Matrix[Double] X, Matrix[Double] y, Matrix[Double] B) |
|   return (Matrix[Double] err) |
| { |
|   scores = msvmPredict(X=X, W=B) |
|   # predicted class = index of the largest score in each row |
|   predicted = rowIndexMax(scores) |
|   hitRate = mean(predicted == y) |
|   err = as.matrix(1 - hitRate) |
| } |