blob: bb1d86bce879de7b8916a66f20f4d79186bb56d0 [file] [log] [blame]
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
# This function splits the input data X and Y into train and test sets by taking a contiguous, ratio-balanced slice of every class
# Related to [SYSTEMDS-2902] dependency function for cleaning pipelines
#
# INPUT:
# --------------------------------------------------------------------------------------------
# X Input feature matrix
# Y Input Labels
# splitRatio Train set fraction [0,1] (default 0.7)
# verbose If TRUE, print the row count and the per-class train/test counts (default FALSE)
# --------------------------------------------------------------------------------------------
#
# OUTPUT:
# ---------------------------------------------------------------------------------------------
# X_train Train split of feature matrix
# X_test Test split of feature matrix
# y_train Train split of label matrix
# y_test Test split of label matrix
# ---------------------------------------------------------------------------------------------
m_splitBalanced = function(Matrix[Double] X, Matrix[Double] Y, Double splitRatio = 0.7, Boolean verbose = FALSE)
return (Matrix[Double] X_train, Matrix[Double] y_train, Matrix[Double] X_test,
        Matrix[Double] y_test)
{
  # Split X/Y into train and test parts class by class: for every label value,
  # the first floor(count * splitRatio) rows go to train and the remaining rows
  # of that class go to test.
  #   X, Y        feature matrix and label column (one label per row of X)
  #   splitRatio  fraction of each class that is sent to the train split
  #   verbose     if TRUE, print the row count and the per-class counts
  # Returns, in this order: X_train, y_train, X_test, y_test.

  # put the label in column 1 and sort by it, so every class is one contiguous run of rows
  XY = order(target = cbind(Y, X), by = 1, decreasing=FALSE, index.return=FALSE)
  # get the class count: row i holds the number of rows whose label is i
  classes = table(XY[, 1], 1)
  split = floor(nrow(X) * splitRatio)
  start_class = 1
  train_row_s = 1
  test_row_s = 1
  train_row_e = 0
  test_row_e = 0
  end_class = 0
  # over-allocate both outputs by one row per class; rows that are never written
  # stay all-zero and are dropped by removeEmpty after the loop
  outTrain = matrix(0, split+nrow(classes), ncol(XY))
  outTest = matrix(0, (nrow(X) - split)+nrow(classes), ncol(XY))
  classes_ratio_train = floor(classes*splitRatio)
  classes_ratio_test = classes - classes_ratio_train
  if(verbose) {
    print("rows "+nrow(X))
    print("classes \n"+toString(classes))
    print("train ratio \n"+toString(classes_ratio_train))
    print("test ratio \n"+toString(classes_ratio_test))
  }
  for(i in 1:nrow(classes))
  {
    n_class = as.scalar(classes[i, 1])
    n_train = as.scalar(classes_ratio_train[i, 1])
    n_test = as.scalar(classes_ratio_test[i, 1])
    # a label value without any rows (gap in the label ids) has nothing to copy;
    # slicing XY[start:(start-1), ] for it would be an invalid range
    if(n_class > 0) {
      end_class = end_class + n_class
      class_t = XY[start_class:end_class, ]
      # small classes can round down to zero train rows; skip the copy instead of
      # indexing with the invalid range 1:0
      if(n_train > 0) {
        train_row_e = train_row_e + n_train
        outTrain[train_row_s:train_row_e, ] = class_t[1:n_train, ]
        train_row_s = train_row_e + 1
      }
      # likewise a class may have no test rows (e.g. splitRatio = 1)
      if(n_test > 0) {
        test_row_e = test_row_e + n_test
        outTest[test_row_s:test_row_e, ] = class_t[(n_train+1):nrow(class_t), ]
        test_row_s = test_row_e + 1
      }
      start_class = end_class + 1
    }
  }
  # drop the unused (all-zero) padding rows
  outTrain = removeEmpty(target = outTrain, margin = "rows")
  outTest = removeEmpty(target = outTest, margin = "rows")
  # column 1 is the label, the remaining columns are the features
  y_train = outTrain[, 1]
  X_train = outTrain[, 2:ncol(outTrain)]
  y_test = outTest[, 1]
  X_test = outTest[, 2:ncol(outTest)]
}