#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
# Builtin function for handling class imbalance using the Synthetic Minority Over-sampling Technique (SMOTE)
# by Nitesh V. Chawla et al., Journal of Artificial Intelligence Research 16 (2002), 321–357
#
# INPUT PARAMETERS:
# ---------------------------------------------------------------------------------------------
# NAME TYPE DEFAULT MEANING
# ---------------------------------------------------------------------------------------------
# X Double --- Matrix of minority class samples
# mask Double --- 0/1 mask row vector where 0 marks a numeric column and 1 marks a categorical column
# s Integer 200 Amount of SMOTE (percentage of oversampling), integral multiple of 100
# k Integer 1 Number of nearest neighbours
# verbose Boolean FALSE If TRUE, print the number of synthesized samples
# ---------------------------------------------------------------------------------------------
# Output(s)
# ---------------------------------------------------------------------------------------------
# NAME TYPE DEFAULT MEANING
# ---------------------------------------------------------------------------------------------
# Y Double --- Matrix of (s/100) * nrow(X) synthetic minority class samples
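# ---------------------------------------------------------------------------------------------
#
# Example (a minimal usage sketch; file name, dimensions and parameter values are hypothetical):
#   # minority-class samples with three numeric columns and one categorical (last) column
#   X = read("minority.csv", format="csv")
#   mask = matrix("0 0 0 1", rows=1, cols=4)
#   # 200% oversampling with 3 nearest neighbours -> 2 * nrow(X) synthetic rows
#   Y = smote(X=X, mask=mask, s=200, k=3, verbose=TRUE)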
m_smote = function(Matrix[Double] X, Matrix[Double] mask, Integer s = 200, Integer k = 1, Boolean verbose = FALSE)
return (Matrix[Double] Y) {
if(s < 100 | (s%%100) != 0)
{
print("the number of samples should be an integral multiple of 100. Setting s = 100")
s = 100
}
if(k < 1) {
print("k should not be less than 1. Setting k value to default k = 1.")
k = 1
}
if(ncol(mask) != ncol(X))
stop("column mismatch: no. of columns in mask vector should be equal to no. of columns in data matrix")
# matrix to keep the index of KNN for each minority sample
knn_index = matrix(0,k,nrow(X))
# find nearest neighbour
for(i in 1:nrow(X))
{
knn = nn(X, X[i, ], mask, k)
knn_index[, i] = knn
}
# number of synthetic samples from each minority class sample
iter = 0
iterLim = (s/100)
# matrix to store synthetic samples
synthetic_samples = matrix(0, iterLim*ncol(knn_index), ncol(X))
# shuffle the nn indexes
#rand_index = ifelse(k < iterLim, sample(k, iterLim, TRUE, 42), sample(k, iterLim, 42))
if (k < iterLim)
rand_index = sample(k, iterLim, TRUE, 42);
else
rand_index = sample(k, iterLim, 42);
while(iter < iterLim)
{
# pick the random NN
knn_sample = knn_index[as.scalar(rand_index[iter+1]),]
# generate sample
for(i in 1:ncol(knn_index)) {
index = as.scalar(knn_sample[1,i])
X_diff = X[index,] - X[i, ]
gap = as.scalar(Rand(rows=1, cols=1, min=0, max=1, seed = 42))
# generate synthetic sample
X_sys = X[i, ] + (gap*X_diff)
# for nominal features replace their value with majority voting
if(sum(mask) > 0) {
categorical = X_sys * mask
# get all nn values
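# table() builds an nrow(X) x nrow(X) selection matrix with non-zero diagonal entries at the
# kNN indexes of sample i, so the multiplication below keeps only the neighbour rows of X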
computation_matrix = table(knn_index[,i], knn_index[, i], nrow(X), nrow(X))
nn_X = computation_matrix %*% X
nn_X = removeEmpty(target=nn_X, margin = "rows")
nn_X = nn_X * mask
freq = getFrequentValue(nn_X)
categorical = (categorical > 0) * freq
X_sys = X_sys * (mask == 0)
X_sys = X_sys + categorical
}
synthetic_samples[iter*ncol(knn_index)+i,] = X_sys;
}
iter = iter + 1
}
Y = synthetic_samples
if(verbose)
print(nrow(Y)+ " synthesized samples generated.")
}
# as described in the paper, for categorical columns the difference is computed by replacing each
# non-zero categorical difference with the median of the standard deviations of the numerical columns
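# (e.g., if the numerical columns had standard deviations 0.5, 2.0 and 3.5, each non-zero
# categorical difference would be replaced by their median 2.0 before computing the distance)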
nn = function(Matrix[Double] X, Matrix[Double] instance, Matrix[Double] mask, Integer k )
return (Matrix[Double] knn_)
{
if(nrow(X) < k)
stop("can not pick "+k+" nearest neighbours from "+nrow(X)+" total instances")
diff = X - instance
diff_nominal = diff * mask
if(sum(diff_nominal) != 0) {
only_number = removeEmpty(target=X, margin="cols", select=(mask==0))
num_std = colSds(only_number)
num_std_median = median(t(num_std))
diff_nominal = (diff_nominal != 0)
diff_nominal = diff_nominal * num_std_median
diff = diff_nominal + (diff * (mask==0))
}
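# Euclidean distance of every sample to the query instance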
square_diff = diff^2
distance = sqrt(rowSums(square_diff))
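# sort ascending by distance and return row indexes; row 1 is the query instance itself
# (distance 0), so rows 2 to k+1 are its k nearest neighbours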
sort_dist = order(target = distance, by = 1, decreasing= FALSE, index.return = TRUE)
knn_ = sort_dist[2:k+1,]
}
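# returns, for each column of X, the most frequent non-zero value (the mode); all-zero columns keep 0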
getFrequentValue = function(Matrix[Double] X)
return (Matrix[Double] freq)
{
freq = matrix(0, rows=1, cols=ncol(X))
for(i in 1:ncol(X))
{
if(sum(X[, i]) != 0) {
cat_counts = table(X[, i], 1, nrow(X), 1); # counts for each category
freq[1,i] = as.scalar(rowIndexMax(t(cat_counts))) # mode
}
}
}