# scripts/builtin/smote.dml - systemds - Git at Google

 #-------------------------------------------------------------
 #
 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
 #
 #-------------------------------------------------------------


 # Builtin function for handling class imbalance using Synthetic Minority Over-sampling Technique (SMOTE)
 #
 # INPUT PARAMETERS:
 # ---------------------------------------------------------------------------------------------
 # NAME            TYPE    DEFAULT     MEANING
 # ---------------------------------------------------------------------------------------------
 # X               Double   ---       Matrix of minority class samples
 # s               Integer   200      Amount of SMOTE (percentage of oversampling), integral multiple of 100
 # k               Integer   1        Number of nearest neighbours
 # verbose         Boolean   FALSE    If TRUE, print the number of synthesized samples
 # ---------------------------------------------------------------------------------------------


 #Output(s)
 # ---------------------------------------------------------------------------------------------
 # NAME            TYPE    DEFAULT     MEANING
 # ---------------------------------------------------------------------------------------------
 # Y               Double   ---       Matrix of (s/100) * nrow(X) synthetic minority class samples

 # Synthesizes new minority class rows by interpolating between each row of X
 # and one of its k nearest neighbours (SMOTE).
 #
 # X        minority class samples, one sample per row
 # s        oversampling percentage; values below 100 or not divisible by 100 are reset to 100
 # k        number of nearest neighbours considered; values below 1 are reset to 1
 # verbose  if TRUE, print the number of generated samples
 #
 # Returns Y with (s/100) * nrow(X) rows and ncol(X) columns.
 m_smote = function(Matrix[Double] X, Integer s = 200, Integer k = 1, Boolean verbose = FALSE)
 return (Matrix[Double] Y) {

   if(s < 100 | (s%%100) != 0)
   {
     print("the number of samples should be an integral multiple of 100. Setting s = 100")
     s = 100
   }

   if(k < 1) {
     print("k should not be less than 1. Setting k value to default k = 1.")
     k = 1
   }

   nSamples = nrow(X)

   # matrix to keep the index of KNN for each minority sample:
   # column i holds the k neighbour row indices of X[i, ]
   knn_index = matrix(0, k, nSamples)
   # find nearest neighbour
   parfor(i in 1:nSamples)
   {
     knn = nn(X, X[i, ], k)
     knn_index[, i] = knn
   }

   # number of synthetic samples from each minority class sample
   iter = 0
   iterLim = (s/100)
   # matrix to store synthetic samples
   synthetic_samples = matrix(0, iterLim*nSamples, ncol(X))

   # shuffle the nn indexes: pick which neighbour rank is used in each round.
   # An if/else is used instead of ifelse() because ifelse() evaluates both
   # arguments, and sampling iterLim values out of k without replacement is
   # not possible when k < iterLim.
   if(k < iterLim)
     rand_index = sample(k, iterLim, TRUE, 42)
   else
     rand_index = sample(k, iterLim, 42)

   while(iter < iterLim)
   {
     # pick the random NN
     knn_sample = knn_index[as.scalar(rand_index[iter+1]),]
     # generate sample
     for(i in 1:nSamples)
     {
       index = as.scalar(knn_sample[1,i])
       X_diff = X[index,] - X[i, ]
       # NOTE(review): the seed is fixed, so presumably every call yields the
       # same gap and all samples share one interpolation factor - confirm
       # whether this is intended.
       gap = as.scalar(Rand(rows=1, cols=1, min=0, max=1, seed = 42))
       X_sys = X[i, ] + (gap*X_diff)
       synthetic_samples[iter*nSamples+i,] = X_sys;
     }
     iter = iter + 1
   }

   Y = synthetic_samples

   if(verbose)
     print(nrow(Y)+ " synthesized samples generated.")

 }


 # Returns the row indices (into X) of the k rows ranked 2..k+1 by ascending
 # euclidean distance to `instance`, as a k x 1 matrix.
 #
 # X         candidate rows
 # instance  single row to measure distances from
 # k         number of neighbours to return
 #
 # Stops with an error when X does not have more than k rows.
 nn = function(Matrix[Double] X, Matrix[Double] instance, Integer k )
 return (Matrix[Double] knn_)
 {
   # rows 2..k+1 of the ranking are returned below, so X needs at least k+1 rows
   if(nrow(X) <= k)
     stop("can not pick "+k+" nearest neighbours from "+nrow(X)+" total instances")

   # compute the euclidean distance of every row of X to the instance
   diff = X - instance
   square_diff = diff^2
   distance = sqrt(rowSums(square_diff))
   # index.return = TRUE yields row indices ordered by ascending distance
   sort_dist = order(target = distance, by = 1, decreasing= FALSE, index.return =  TRUE)
   # skip the first-ranked row (presumably the instance itself at distance 0)
   knn_ = sort_dist[2:(k+1),]
 }
	#-------------------------------------------------------------
	#
	# Licensed to the Apache Software Foundation (ASF) under one
	# or more contributor license agreements. See the NOTICE file
	# distributed with this work for additional information
	# regarding copyright ownership. The ASF licenses this file
	# to you under the Apache License, Version 2.0 (the
	# "License"); you may not use this file except in compliance
	# with the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing,
	# software distributed under the License is distributed on an
	# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	# KIND, either express or implied. See the License for the
	# specific language governing permissions and limitations
	# under the License.
	#
	#-------------------------------------------------------------


	# Builtin function for handling class imbalance using Synthetic Minority Over-sampling Technique (SMOTE)
	#
	# INPUT PARAMETERS:
	# ---------------------------------------------------------------------------------------------
	# NAME TYPE DEFAULT MEANING
	# ---------------------------------------------------------------------------------------------
	# X Double --- Matrix of minority class samples
	# s Integer 200 Amount of SMOTE (percentage of oversampling), integral multiple of 100
	# k Integer 1 Number of nearest neighbours
	# verbose Boolean FALSE If TRUE, print the number of synthesized samples
	# ---------------------------------------------------------------------------------------------


	#Output(s)
	# ---------------------------------------------------------------------------------------------
	# NAME TYPE DEFAULT MEANING
	# ---------------------------------------------------------------------------------------------
	# Y Double --- Matrix of (s/100) * nrow(X) synthetic minority class samples

	# Synthesizes new minority class rows by interpolating between each row of X
	# and one of its k nearest neighbours (SMOTE).
	#
	# X        minority class samples, one sample per row
	# s        oversampling percentage; values below 100 or not divisible by 100 are reset to 100
	# k        number of nearest neighbours considered; values below 1 are reset to 1
	# verbose  if TRUE, print the number of generated samples
	#
	# Returns Y with (s/100) * nrow(X) rows and ncol(X) columns.
	m_smote = function(Matrix[Double] X, Integer s = 200, Integer k = 1, Boolean verbose = FALSE)
	return (Matrix[Double] Y) {

	  # plain `|` is the logical OR; the escaped form `\|` is not valid DML
	  if(s < 100 | (s%%100) != 0)
	  {
	    print("the number of samples should be an integral multiple of 100. Setting s = 100")
	    s = 100
	  }

	  if(k < 1) {
	    print("k should not be less than 1. Setting k value to default k = 1.")
	    k = 1
	  }

	  nSamples = nrow(X)

	  # matrix to keep the index of KNN for each minority sample:
	  # column i holds the k neighbour row indices of X[i, ]
	  knn_index = matrix(0, k, nSamples)
	  # find nearest neighbour
	  parfor(i in 1:nSamples)
	  {
	    knn = nn(X, X[i, ], k)
	    knn_index[, i] = knn
	  }

	  # number of synthetic samples from each minority class sample
	  iter = 0
	  iterLim = (s/100)
	  # matrix to store synthetic samples
	  synthetic_samples = matrix(0, iterLim*nSamples, ncol(X))

	  # shuffle the nn indexes: pick which neighbour rank is used in each round.
	  # An if/else is used instead of ifelse() because ifelse() evaluates both
	  # arguments, and sampling iterLim values out of k without replacement is
	  # not possible when k < iterLim.
	  if(k < iterLim)
	    rand_index = sample(k, iterLim, TRUE, 42)
	  else
	    rand_index = sample(k, iterLim, 42)

	  while(iter < iterLim)
	  {
	    # pick the random NN
	    knn_sample = knn_index[as.scalar(rand_index[iter+1]),]
	    # generate sample
	    for(i in 1:nSamples)
	    {
	      index = as.scalar(knn_sample[1,i])
	      X_diff = X[index,] - X[i, ]
	      # NOTE(review): the seed is fixed, so presumably every call yields the
	      # same gap and all samples share one interpolation factor - confirm
	      # whether this is intended.
	      gap = as.scalar(Rand(rows=1, cols=1, min=0, max=1, seed = 42))
	      X_sys = X[i, ] + (gap*X_diff)
	      synthetic_samples[iter*nSamples+i,] = X_sys;
	    }
	    iter = iter + 1
	  }

	  Y = synthetic_samples

	  if(verbose)
	    print(nrow(Y)+ " synthesized samples generated.")

	}



	# Returns the row indices (into X) of the k rows ranked 2..k+1 by ascending
	# euclidean distance to `instance`, as a k x 1 matrix.
	#
	# X         candidate rows
	# instance  single row to measure distances from
	# k         number of neighbours to return
	#
	# Stops with an error when X does not have more than k rows.
	nn = function(Matrix[Double] X, Matrix[Double] instance, Integer k )
	return (Matrix[Double] knn_)
	{
	  # rows 2..k+1 of the ranking are returned below, so X needs at least k+1 rows
	  if(nrow(X) <= k)
	    stop("can not pick "+k+" nearest neighbours from "+nrow(X)+" total instances")

	  # compute the euclidean distance of every row of X to the instance
	  diff = X - instance
	  square_diff = diff^2
	  distance = sqrt(rowSums(square_diff))
	  # index.return = TRUE yields row indices ordered by ascending distance
	  sort_dist = order(target = distance, by = 1, decreasing= FALSE, index.return = TRUE)
	  # skip the first-ranked row (presumably the instance itself at distance 0)
	  knn_ = sort_dist[2:(k+1),]
	}