blob: 6169dbff29930256295fed5d3cbca0749d95227b [file] [log] [blame]
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
#
# UNDERSAMPLING TECHNIQUE;
# COMPUTES TOMEK LINKS AND DROPS THEM FROM DATA MATRIX AND LABEL VECTOR
# DROPS ONLY THE MAJORITY LABEL AND CORRESPONDING POINT OF TOMEK LINKS
#
# INPUT PARAMETERS:
# ------------------------------------------------------------
# NAME TYPE DEFAULT MEANING
# ------------------------------------------------------------
# X MATRIX --- Data Matrix (nxm)
# y MATRIX --- Label Matrix (nx1), greater than zero
# ------------------------------------------------------------
# OUTPUT:
# X_under - Data Matrix without Tomek links
# y_under - Labels corresponding to undersampled data
# drop_idx - Indices of dropped rows/labels wrt input
m_tomeklink = function(Matrix[Double] X, Matrix[Double] y)
return (Matrix[Double] X_under, Matrix[Double] y_under, Matrix[Double] drop_idx)
{
ymin = min(y)
if(ymin == 0)
y = y + 1
# # find the majority labels
label = table(y, 1)
majority_label = as.scalar(rowIndexMax(t(label)))
tomek_links = get_links(X, y, majority_label)
drop_idx = tomek_links * seq(1, nrow(X))
if(sum(tomek_links == 0) > 0)
{
X_under = removeEmpty(target=X, margin="rows", select = (tomek_links == 0))
y_under = removeEmpty(target=y, margin="rows", select = (tomek_links == 0))
drop_idx = removeEmpty(target=drop_idx, margin="rows", select = tomek_links)
}
else
{
X_under = X
y_under = y
drop_idx = as.matrix(NaN)
}
if(ymin == 0)
y_under = y_under - 1
}
# get the nearest neighbour index
get_nn = function(Matrix[Double] X)
return (Matrix[Double] nn) {
# TODO exchange manhatten by euclidean dist()?
nn = matrix(0, rows = nrow(X), cols = 1)
for (i in 1:nrow(X)) {
dists = rowSums((X - X[i,])^2)
dists[i,] = NaN; # mask out self-ref
nn[i, 1] = rowIndexMin(t(dists))
}
}
# find the tomek links
get_links = function(Matrix[Double] X, Matrix[Double] y, double majority_label)
return (Matrix[Double] tomek_links) {
nn = get_nn(X)
perm = table(seq(1, nrow(y)), nn, nrow(y), nrow(y))
nn_labels = perm %*% y
links = (y != majority_label) & (nn_labels == majority_label)
tomek_links = (table(nn, 1, links, nrow(y), 1) > 0)
}