[SYSTEMDS-2659] imputeByFD now accepts matrix input. The initial version of imputeByFD accepted a frame input, then internally recoded the frame and performed the imputations. Now, the method accepts a matrix input (a recoded matrix for non-numeric data) and directly performs the imputations on the matrix values.

commit: 36eaaeb961130471c7d8f19456a7848312ff25b5 [log] [tgz]
author: Shafaq Siddiqi <shafaq.siddiqi@tugraz.at> Tue Sep 01 22:47:36 2020 +0200
committer: Shafaq Siddiqi <shafaq.siddiqi@tugraz.at> Tue Sep 01 22:47:36 2020 +0200
tree: a25db26c44d1a169a922a9304c914cbe1614597b
parent: b3ef333a164a279abeccad50ca0fab268a308a3e [diff]
diff --git a/scripts/builtin/imputeByFD.dml b/scripts/builtin/imputeByFD.dml
index 8ad523a..01281d2 100644
--- a/scripts/builtin/imputeByFD.dml
+++ b/scripts/builtin/imputeByFD.dml

@@ -25,7 +25,7 @@
 # ---------------------------------------------------------------------------------------------
 # NAME            TYPE    DEFAULT     MEANING
 # ---------------------------------------------------------------------------------------------
-# F               String    --       Data frame
+# X               Double    --       Matrix X 
 # source          Integer   --       source attribute to use for imputation and error correction
 # target          Integer   --       attribute to be fixed
 # threshold       Double    --       threshold value in interval [0, 1] for robust FDs 
@@ -36,39 +36,21 @@
 # ---------------------------------------------------------------------------------------------
 # NAME            TYPE    DEFAULT     MEANING
 # ---------------------------------------------------------------------------------------------
-# imputed_F      String   ---        Frame with possible imputations 
+# X               Double   ---        Matrix with possible imputations 
 
 
-s_imputeByFD = function(Frame[String] F, Integer sourceAttribute, Integer targetAttribute, Double threshold)
-  return(Frame[String] imputed_F)
+m_imputeByFD = function(Matrix[Double] X, Integer sourceAttribute, Integer targetAttribute, Double threshold)
+  return(Matrix[Double] X)
 {
-
   # sanity checks
   if( threshold < 0 | threshold > 1 )
     stop("Stopping due to invalid input, threshold required in interval [0, 1] found "+threshold)
 
-  if(sourceAttribute < 0 | sourceAttribute > ncol(F) | targetAttribute < 0 | targetAttribute > ncol(F))
+  if(sourceAttribute < 0 | sourceAttribute > ncol(X) | targetAttribute < 0 | targetAttribute > ncol(X))
     stop("Stopping due to invalid source and target")
-
-
-  # detect schema for transformation
-  schema = detectSchema(F)
-  s=""
-  for(i in 1: ncol(F)) {
-    if(as.scalar(schema[1,i]) == "STRING" | as.scalar(schema[1,i]) == "BOOLEAN" )
-      s = s+as.integer(i)+","; 
-  }
-  
-  # recode data frame
-  jspecR = "{ids:true, recode:["+s+"]}";
-  [X, M] = transformencode(target=F, spec=jspecR);
-
+ 
   # impute missing values and fix errors
   X[,targetAttribute] = imputeAndCorrect(X[,sourceAttribute], X[,targetAttribute], threshold) 
-
-  # getting the actual data back
-  dF = transformdecode(target=X, spec=jspecR, meta=M);
-  imputed_F = dF;
 }
 
 imputeAndCorrect = function(Matrix[Double] X, Matrix[Double] Y, Double threshold)

diff --git a/src/test/scripts/functions/builtin/imputeFD.dml b/src/test/scripts/functions/builtin/imputeFD.dml
index 9325562..4782921 100644
--- a/src/test/scripts/functions/builtin/imputeFD.dml
+++ b/src/test/scripts/functions/builtin/imputeFD.dml

@@ -16,6 +16,24 @@
 #
 #-------------------------------------------------------------
 
-X = read($1, data_type="frame", format="csv", header=FALSE);
+F = read($1, data_type="frame", format="csv", header=FALSE);
+# as the method accepts the matrix so convert the non-numeric data into matrix
+
+# detect schema for transformation
+schema = detectSchema(F)
+s=""
+for(i in 1: ncol(F)) {
+  if(as.scalar(schema[1,i]) == "STRING" | as.scalar(schema[1,i]) == "BOOLEAN" )
+    s = s+as.integer(i)+","; 
+}
+  
+# recode data frame
+jspecR = "{ids:true, recode:["+s+"]}";
+[X, M] = transformencode(target=F, spec=jspecR);
+# call the method
 Y = imputeByFD(X, $2, $3, $4);
-write(Y, $5, format="binary")
+
+# getting the actual data back
+dF = transformdecode(target=Y, spec=jspecR, meta=M);
+
+write(dF, $5, format="binary")
commit	36eaaeb961130471c7d8f19456a7848312ff25b5	[log] [tgz]
author	Shafaq Siddiqi <shafaq.siddiqi@tugraz.at>	Tue Sep 01 22:47:36 2020 +0200
committer	Shafaq Siddiqi <shafaq.siddiqi@tugraz.at>	Tue Sep 01 22:47:36 2020 +0200
tree	a25db26c44d1a169a922a9304c914cbe1614597b
parent	b3ef333a164a279abeccad50ca0fab268a308a3e [diff]