[SYSTEMDS-2658] Synthetic Minority Over-sampling Technique (SMOTE)
Technique for handling unbalance classes by oversampling the minority class
Date: Wed Sep 2 14:16:32 2020 +0200
Closes #988
diff --git a/scripts/builtin/smote.dml b/scripts/builtin/smote.dml
new file mode 100644
index 0000000..14947ea
--- /dev/null
+++ b/scripts/builtin/smote.dml
@@ -0,0 +1,99 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+
+# Builtin function for handing class imbalance using Synthetic Minority Over-sampling Technique (SMOTE)
+#
+# INPUT PARAMETERS:
+# ---------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ---------------------------------------------------------------------------------------------
+# X Double --- Matrix of minority class samples
+# s Integer 25 Amount of SMOTE (percentage of oversampling), integral multiple of 100
+# k Integer 1 Number of nearest neighbour
+# ---------------------------------------------------------------------------------------------
+
+
+#Output(s)
+# ---------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ---------------------------------------------------------------------------------------------
+# Y Double --- Matrix of (N/100)-1 * nrow(X) synthetic minority class samples
+
+m_smote = function(Matrix[Double] X, Integer s = 200, Integer k = 1, Boolean verbose = FALSE)
+return (Matrix[Double] Y) {
+
+ if(s < 100 | (s%%100) != 0)
+ {
+ print("the number of samples should be an integral multiple of 100. Setting s = 100")
+ s = 100
+ }
+ # matrix to keep the index of KNN for each minority sample
+ knn_index = matrix(0,k,0)
+ # find nearest neighbour
+ for(i in 1:nrow(X))
+ {
+ knn = nn(X, X[i, ], k)
+ knn_index = cbind(knn_index, knn)
+ }
+
+ # number of synthetic samples from each minority class sample
+ iter = (s/100)-1
+ # matrix to store synthetic samples
+ synthetic_samples = matrix(0, 0, ncol(X))
+ while(iter > 0)
+ {
+ # generate a random number
+ # TODO avoid duplicate random numbers
+ rand_index = as.integer(as.scalar(Rand(rows=1, cols=1, min=1, max=k)))
+ # pick the random NN
+ knn_sample = knn_index[rand_index,]
+ # generate sample
+ for(i in 1:ncol(knn_index))
+ {
+ index = as.scalar(knn_sample[1,i])
+ X_diff = X[index,] - X[i, ]
+ gap = as.scalar(Rand(rows=1, cols=1, min=0, max=1))
+ X_sys = X[i, ] + (gap*X_diff)
+ synthetic_samples = rbind(synthetic_samples, X_sys)
+ }
+ iter = iter - 1
+ }
+
+ Y = synthetic_samples
+}
+
+
+
+nn = function(Matrix[Double] X, Matrix[Double] instance, Integer k )
+return (Matrix[Double] knn_)
+{
+ if(nrow(X) < k)
+ stop("can not pick "+k+" nearest neighbours from "+nrow(X)+" total instances")
+
+ # compute the euclidean distance
+ diff = X - instance
+ square_diff = diff^2
+ distance = sqrt(rowSums(square_diff))
+ sort_dist = order(target = distance, by = 1, decreasing= FALSE, index.return = TRUE)
+ knn_ = sort_dist[2:k+1,]
+}
+
diff --git a/src/main/java/org/apache/sysds/common/Builtins.java b/src/main/java/org/apache/sysds/common/Builtins.java
index 20fad72..28acc5b 100644
--- a/src/main/java/org/apache/sysds/common/Builtins.java
+++ b/src/main/java/org/apache/sysds/common/Builtins.java
@@ -177,6 +177,7 @@
SINH("sinh", false),
STEPLM("steplm",true, ReturnType.MULTI_RETURN),
SLICEFINDER("slicefinder", true),
+ SMOTE("smote", true),
SOLVE("solve", false),
SQRT("sqrt", false),
SUM("sum", false),
diff --git a/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinSmoteTest.java b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinSmoteTest.java
new file mode 100644
index 0000000..675e741
--- /dev/null
+++ b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinSmoteTest.java
@@ -0,0 +1,103 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.test.functions.builtin;
+
+import org.apache.sysds.api.DMLScript;
+import org.apache.sysds.common.Types;
+import org.apache.sysds.hops.OptimizerUtils;
+import org.apache.sysds.lops.LopProperties;
+import org.apache.sysds.runtime.matrix.data.MatrixValue;
+import org.apache.sysds.test.AutomatedTestBase;
+import org.apache.sysds.test.TestConfiguration;
+import org.apache.sysds.test.TestUtils;
+import org.junit.Assert;
+import org.junit.Test;
+
+import java.util.HashMap;
+
+public class BuiltinSmoteTest extends AutomatedTestBase {
+
+ private final static String TEST_NAME = "smote";
+ private final static String TEST_DIR = "functions/builtin/";
+ private static final String TEST_CLASS_DIR = TEST_DIR + BuiltinSmoteTest.class.getSimpleName() + "/";
+
+ private final static int rows = 20;
+ private final static int colsX = 20;
+
+ @Override
+ public void setUp() {
+ TestUtils.clearAssertionInformation();
+ addTestConfiguration(TEST_NAME, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME, new String[] {"C"}));
+ }
+
+ @Test
+ public void testSmote1CP() {
+ runSmoteTest(300, 3, LopProperties.ExecType.CP);
+ }
+
+ @Test
+ public void testSmote2CP() {
+ runSmoteTest(400, 5, LopProperties.ExecType.CP);
+ }
+
+ @Test
+ public void testSmote1Spark() {
+ runSmoteTest(300, 3, LopProperties.ExecType.SPARK);
+ }
+
+ @Test
+ public void testSmote2Spark() { runSmoteTest(400, 5, LopProperties.ExecType.SPARK); }
+
+
+ private void runSmoteTest(int sample, int nn, LopProperties.ExecType instType) {
+ Types.ExecMode platformOld = setExecMode(instType);
+
+ boolean oldFlag = OptimizerUtils.ALLOW_ALGEBRAIC_SIMPLIFICATION;
+ boolean sparkConfigOld = DMLScript.USE_LOCAL_SPARK_CONFIG;
+ OptimizerUtils.ALLOW_ALGEBRAIC_SIMPLIFICATION = false;
+ try {
+ loadTestConfiguration(getTestConfiguration(TEST_NAME));
+ String HOME = SCRIPT_DIR + TEST_DIR;
+ fullDMLScriptName = HOME + TEST_NAME + ".dml";
+ programArgs = new String[] {"-nvargs", "X=" + input("X"), "S=" + sample, "K=" + nn , "Z="+output("Sum"), "T="+input("T")};
+
+ double[][] X = getRandomMatrix(rows, colsX, 0, 1, 0.3, 1);
+
+ writeInputMatrixWithMTD("X", X, true);
+
+ double[][] T = getRandomMatrix(rows, colsX, 2, 3.0, 0.3, 3);
+
+ writeInputMatrixWithMTD("T", T, true);
+
+ runTest(true, false, null, -1);
+ HashMap<MatrixValue.CellIndex, Double> value = readDMLMatrixFromHDFS("Sum");
+ Assert.assertEquals("synthetic samples does not fall into minority class cluster",1,
+ value.get(new MatrixValue.CellIndex(1,1)), 0.000001);
+ }
+ finally {
+ rtplatform = platformOld;
+ DMLScript.USE_LOCAL_SPARK_CONFIG = sparkConfigOld;
+ OptimizerUtils.ALLOW_ALGEBRAIC_SIMPLIFICATION = oldFlag;
+ OptimizerUtils.ALLOW_AUTO_VECTORIZATION = true;
+ OptimizerUtils.ALLOW_OPERATOR_FUSION = true;
+ }
+ }
+}
+
diff --git a/src/test/scripts/functions/builtin/smote.dml b/src/test/scripts/functions/builtin/smote.dml
new file mode 100644
index 0000000..dc33f18
--- /dev/null
+++ b/src/test/scripts/functions/builtin/smote.dml
@@ -0,0 +1,40 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+
+A = read($X);
+B = smote(X = A, s=$S, k=$K);
+
+# test if all point fall in same cluster (closed to each other)
+# read some new data T != A
+T = read($T);
+# bind all instanced of minority class
+A_B = rbind(A, B)
+n = nrow(A_B)
+# group data into k=2 clusters
+[C, Y] = kmeans(rbind(A_B, T), 2, 10, 100, 0.000001, FALSE, 50)
+# check if the instances of A and B fall in same cluster
+check = matrix(as.scalar(Y[1,1]), n, 1)
+testSum = sum(check - Y[1:n,])
+# hack for avoiding null pointer exception while reading a single zero in HashMap
+testSum = ifelse(testSum == 0, 1, testSum)
+testSum = as.matrix(testSum)
+write(testSum, $Z);
\ No newline at end of file