| #------------------------------------------------------------- |
| # |
| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| # |
| #------------------------------------------------------------- |
| |
| # This function splits input data X and Y into contiguous train and test sets that keep the per-class ratio balanced |
| # Related to [SYSTEMDS-2902] dependency function for cleaning pipelines |
| # |
| # INPUT: |
| # -------------------------------------------------------------------------------------------- |
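| # X Input feature matrix |
| # Y Input labels (class ids; they are counted per class with table()) |
| # splitRatio Train set fraction in [0,1], applied to every class (default 0.7) |
| # verbose If TRUE, print the row count, the class counts, and the per-class train/test counts |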
| # -------------------------------------------------------------------------------------------- |
| # |
| # OUTPUT: |
| # --------------------------------------------------------------------------------------------- |
| # X_train Train split of feature matrix |
| # y_train Train split of label matrix |
| # X_test Test split of feature matrix |
| # y_test Test split of label matrix |
| # --------------------------------------------------------------------------------------------- |
| |
| m_splitBalanced = function(Matrix[Double] X, Matrix[Double] Y, Double splitRatio = 0.7, Boolean verbose = FALSE) |
| return (Matrix[Double] X_train, Matrix[Double] y_train, Matrix[Double] X_test, |
| Matrix[Double] y_test) |
| { |
|   # Splits (X, Y) into a train and a test set so that every class contributes |
|   # floor(count * splitRatio) rows to train and its remaining rows to test. |
|   #   X          feature matrix |
|   #   Y          label column; classes are counted with table() |
|   #   splitRatio fraction of each class that goes to the train set |
|   #   verbose    if TRUE, print the row count and the per-class counts |
|   # Returns X_train, y_train, X_test, y_test (in this order), rows grouped by class. |
| |
|   # put the label in column 1 and sort by it, so each class is one contiguous row range |
|   XY = order(target = cbind(Y, X), by = 1, decreasing=FALSE, index.return=FALSE) |
|   # get the class count: classes[i] is the number of rows carrying label i |
|   classes = table(XY[, 1], 1) |
|   split = floor(nrow(X) * splitRatio) |
|   start_class = 1 |
|   train_row_s = 1 |
|   test_row_s = 1 |
|   train_row_e = 0 |
|   test_row_e = 0 |
|   end_class = 0 |
| |
|   # over-allocate both outputs by one row per class; unused rows stay all-zero |
|   # and are dropped by removeEmpty after the loop |
|   outTrain = matrix(0, split+nrow(classes), ncol(XY)) |
|   outTest = matrix(0, (nrow(X) - split)+nrow(classes), ncol(XY)) |
| |
|   classes_ratio_train = floor(classes*splitRatio) |
|   classes_ratio_test = classes - classes_ratio_train |
|   if(verbose) { |
|     print("rows "+nrow(X)) |
|     print("classes \n"+toString(classes)) |
|     print("train ratio \n"+toString(classes_ratio_train)) |
|     print("test ratio \n"+toString(classes_ratio_test)) |
|   } |
| |
|   for(i in 1:nrow(classes)) |
|   { |
|     n_class = as.scalar(classes[i, 1]) |
|     n_train = as.scalar(classes_ratio_train[i, 1]) |
|     n_test = as.scalar(classes_ratio_test[i, 1]) |
| |
|     # a label id that never occurs has count 0; slicing XY for it would be an |
|     # invalid (end < start) range, so such classes are skipped |
|     if(n_class > 0) |
|     { |
|       end_class = start_class + n_class - 1 |
|       class_t = XY[start_class:end_class, ] |
| |
|       # floor(count * splitRatio) can be 0 for small classes; only copy non-empty ranges |
|       if(n_train > 0) |
|       { |
|         train_row_e = train_row_s + n_train - 1 |
|         outTrain[train_row_s:train_row_e, ] = class_t[1:n_train, ] |
|         train_row_s = train_row_e + 1 |
|       } |
| |
|       # the test part is empty when all rows of the class went to train (e.g. splitRatio = 1) |
|       if(n_test > 0) |
|       { |
|         test_row_e = test_row_s + n_test - 1 |
|         outTest[test_row_s:test_row_e, ] = class_t[(n_train+1):nrow(class_t), ] |
|         test_row_s = test_row_e + 1 |
|       } |
| |
|       start_class = end_class + 1 |
|     } |
|   } |
|   # drop the all-zero padding rows left over from the over-allocation |
|   outTrain = removeEmpty(target = outTrain, margin = "rows") |
|   outTest = removeEmpty(target = outTest, margin = "rows") |
|   # column 1 holds the labels, the remaining columns hold the features |
|   y_train = outTrain[, 1] |
|   X_train = outTrain[, 2:ncol(outTrain)] |
|   y_test = outTest[, 1] |
|   X_test = outTest[, 2:ncol(outTest)] |
| |
| } |