blob: 8256ffd485331695c9c25cd392652633e1a32680 [file] [log] [blame]
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
# Builtin to perform random under sampling on data.
#
# INPUT:
# -------------------------------------------------------------------------------------------
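# X Feature matrix to sample from
# Y Class labels (positive integers, one per row of X); the same rows are removed from X and Y
# ratio Approximate fraction of majority-class rows to remove, expected in [0, 0.5]; values outside this range are replaced by 0.1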
# -------------------------------------------------------------------------------------------
#
# OUTPUT:
# -----------------------------------------------------------------------------------------
# X The undersampled data X
# Y The undersampled data Y
# -----------------------------------------------------------------------------------------
m_underSampling = function(Matrix[Double] X, Matrix[Double] Y, Double ratio)
return(Matrix[Double] X, Matrix[Double] Y)
{
  # Randomly removes about `ratio` of the rows belonging to the most frequent
  # class in Y; the same rows are dropped from X and Y. All other classes are
  # left untouched. The random draw uses a fixed seed, so results are reproducible.

  # Fall back to a default when the requested ratio lies outside [0, 0.5].
  if(ratio < 0 | ratio > 0.5) {
    ratio = 0.1
    print("ratio should be greater than 0 and less than 0.5 setting ratio = 0.1")
  }

  # Count rows per class label; the label with the highest count is the majority class.
  classes = table(Y, 1)
  maxClass = as.scalar(rowIndexMax(t(classes)))

  # Collect the row indices (1-based) of all majority-class rows.
  isMajority = (Y == maxClass)
  dX = seq(1, nrow(X))
  majority = removeEmpty(target=dX, margin="rows", select=isMajority)

  # Mark about `ratio` of the majority rows for removal: rand() yields values in
  # [1, 2] at the sampled positions and 0 elsewhere, so "> 0" gives a 0/1 mask,
  # and multiplying by `majority` turns the mask into row indices (0 = not marked).
  u_select = rand(rows=nrow(majority), cols=1, min=1, max=2, sparsity=(ratio), seed=1)
  u_select = u_select > 0
  u_select = u_select * majority

  # Only drop rows when at least one row was marked. With nothing marked
  # (e.g. ratio = 0 or a very small majority class) removeEmpty would hand back
  # a single row of zeros, and 0 is not a valid row index for table() below;
  # in that case X and Y are returned unchanged.
  if(sum(u_select) > 0) {
    u_select = removeEmpty(target = u_select, margin = "rows")
    # Indicator vector of length nrow(X): 1 at every row marked for removal.
    u_select1 = table(u_select, 1, 1, nrow(X), 1)
    # Keep exactly the rows that were not marked.
    sel = (u_select1 == 0)
    X = removeEmpty(target=X, margin="rows", select = sel)
    Y = removeEmpty(target=Y, margin="rows", select = sel)
  }
}