| #------------------------------------------------------------- |
| # |
| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| # |
| #------------------------------------------------------------- |
| # Generate the logical pipelines for data cleaning |
| |
| source("scripts/pipelines/scripts/utils.dml") as utils; |
| source("scripts/pipelines/scripts/enumerateLogical.dml") as lg; |
| |
| |
| # Driver script: reads a dirty dataset plus its meta information, encodes it, |
| # performs a holdout split and asks lg::enumerateLogical for the best logical |
| # cleaning pipeline. The convergence flag returned by the enumerator is written to $O. |
| |
| # read the inputs |
| X = read($dirtyData, data_type="frame", format="csv", header=TRUE, |
| naStrings= ["NA", "null"," ","NaN", "nan", "", "?", "99999"]); |
| |
| metaInfo = read($metaData, data_type="frame", format="csv", header=FALSE); |
| primitives = read($primitives, data_type = "frame", format="csv", header= TRUE) |
| param = read($parameters, data_type = "frame", format="csv", header= TRUE) |
| |
| # scalar script arguments |
| max_iter = $max_iter |
| dirtyScore = $dirtyScore |
| expectedIncrease=$expectedIncrease |
| trainTestSplit = 0.7 # fraction of rows used for training in the holdout split below |
| |
| # meta information layout: row 1 = schema, row 2 = mask, row 3 = FD mask; |
| # the first column of metaInfo is skipped in all three |
| getSchema = metaInfo[1, 2:ncol(metaInfo)] |
| getMask = as.matrix(metaInfo[2, 2:ncol(metaInfo)]) |
| getFdMask = as.matrix(metaInfo[3, 2:ncol(metaInfo)]) # columns of interest for FD computation |
| |
| # encode the categorical data |
| if(sum(getMask) > 0) |
| { |
| # always recode the label |
| index = vectorToCsv(getMask) |
| jspecR = "{ids:true, recode:["+index+"]}" |
| [eX, X_meta] = transformencode(target=X, spec=jspecR); |
| # change the schema to reflect the encoded values |
| getSchema = map(getSchema, "x->x.replace(\"STRING\", \"INT64\")") |
| getSchema = map(getSchema, "x->x.replace(\"BOOLEAN\", \"INT64\")") |
| } |
| # if no categorical value exists then just cast the frame into a matrix |
| else |
| { |
| eX = as.matrix(X) |
| } |
| |
| # extract the class label (last column) and drop it from the features |
| eY = eX[, ncol(eX)] |
| eX = eX[, 1:ncol(eX) - 1] |
| |
| |
| getMask = getMask[, 1:ncol(getMask) - 1] # strip the class label from the mask |
| getFdMask = getFdMask[, 1:ncol(getFdMask) - 1] # strip the class label from the FD mask |
| getSchema = getSchema[, 1:ncol(getSchema) - 1] # strip the class label from the schema |
| |
| # NOTE(review): fd is passed as the placeholder as.matrix(0), not getFdMask -- confirm this is intended |
| metaList = list(mask=getMask, schema=getSchema, fd=as.matrix(0), applyFunc=as.frame("NULL"), distY = 20) |
| |
| # initial population of logical operators handed to the enumerator (one per row) |
| logical = frame([ |
| "MVI", |
| "ED", |
| "OTLR", |
| "EC" |
| ], rows=4, cols=1) |
| |
| # when categorical columns are present, append a DUMMY step to every seed |
| if(sum(getMask) > 0) |
| { |
| dummyEncode = frame("DUMMY", rows=nrow(logical), cols=1) |
| logical = cbind(logical, dummyEncode) |
| } |
| |
| # doing holdout evaluation: the first trainTestSplit fraction of rows is used for training |
| split = nrow(eX) * trainTestSplit |
| |
| trainX = eX[1:split,] |
| trainY = eY[1:split,] |
| testX = eX[split+1:nrow(eX),] |
| testY = eY[split+1:nrow(eX),] |
| |
| |
| # evalFunHp columns are consumed by evalML as icpt, reg, tol, maxi |
| [bestLogical, bestHp, converged] = lg::enumerateLogical(X=trainX, y=trainY, Xtest=testX, ytest=testY, |
| initial_population=logical, seed = 42, max_iter=max_iter, metaList = metaList, evaluationFunc="evalML", |
| dirtyScore = dirtyScore + expectedIncrease, evalFunHp=matrix("1 1e-3 1e-9 100", rows=1, cols=4), primitives=primitives, |
| param=param, cv=FALSE, verbose=TRUE) |
| |
| |
| print("bestLogical "+toString(bestLogical)) |
| print("result satisfied ------------"+converged) |
| |
| write(converged , $O) |
| |
| |
| # Evaluation UDF, referenced by name ("evalML") in the enumerateLogical call. |
| # Trains multiLogReg on (X, Y) using the hyper-parameters in the first row of |
| # evalFunHp (columns 1-4: icpt, reg, tol, maxi) and returns, as a 1x1 matrix, |
| # the accuracy value reported by multiLogRegPredict on (Xtest, Ytest). |
| # Xorig is accepted for interface compatibility but is not used in this body. |
| evalML = function(Matrix[Double] X, Matrix[Double] Y, Matrix[Double] Xtest, Matrix[Double] Ytest, Matrix[Double] Xorig=as.matrix(0), |
| Matrix[Double] evalFunHp) |
| |
| return(Matrix[Double] accuracy) |
| { |
| # unpack the hyper-parameter row |
| icpt = as.scalar(evalFunHp[1,1]) |
| reg = as.scalar(evalFunHp[1,2]) |
| tol = as.scalar(evalFunHp[1,3]) |
| maxi = as.scalar(evalFunHp[1,4]) |
| # fit the model on the training split |
| model = multiLogReg(X=X, Y=Y, icpt=icpt, reg=reg, tol=tol, maxi=maxi, maxii=50, verbose=FALSE); |
| # score on the test split; only the third output (accuracy) is used |
| [probs, predY, acc] = multiLogRegPredict(Xtest, model, Ytest, FALSE) |
| print("accuracy weighted: "+acc) |
| accuracy = as.matrix(acc) |
| } |
| |
| # Error of model B on (X, y): 1 - acc/100, where acc is the third output of |
| # multiLogRegPredict (called with verbose=TRUE). Returned as a 1x1 matrix. |
| accuracy = function(Matrix[Double] X, Matrix[Double] y, Matrix[Double] B) return (Matrix[Double] err) { |
| [probs, predY, accVal] = multiLogRegPredict(X=X, B=B, Y=y, verbose=TRUE); |
| accFraction = accVal / 100 |
| err = as.matrix(1 - accFraction); |
| } |