[SYSTEMDS-2659] imputeByFD now accepts a matrix input
The initial version of imputeByFD accepted a frame input, internally recoded the
frame, and then performed the imputations. Now the method accepts a matrix input
(a recoded matrix for non-numeric data) and performs the imputations directly on the matrix values.
diff --git a/scripts/builtin/imputeByFD.dml b/scripts/builtin/imputeByFD.dml
index 8ad523a..01281d2 100644
--- a/scripts/builtin/imputeByFD.dml
+++ b/scripts/builtin/imputeByFD.dml
@@ -25,7 +25,7 @@
# ---------------------------------------------------------------------------------------------
# NAME TYPE DEFAULT MEANING
# ---------------------------------------------------------------------------------------------
-# F String -- Data frame
+# X Double -- Matrix X (non-numeric data must be recoded beforehand)
# source Integer -- source attribute to use for imputation and error correction
# target Integer -- attribute to be fixed
# threshold Double -- threshold value in interval [0, 1] for robust FDs
@@ -36,39 +36,21 @@
# ---------------------------------------------------------------------------------------------
# NAME TYPE DEFAULT MEANING
# ---------------------------------------------------------------------------------------------
-# imputed_F String --- Frame with possible imputations
+# X Double --- Matrix with possible imputations
-s_imputeByFD = function(Frame[String] F, Integer sourceAttribute, Integer targetAttribute, Double threshold)
- return(Frame[String] imputed_F)
+m_imputeByFD = function(Matrix[Double] X, Integer sourceAttribute, Integer targetAttribute, Double threshold)
+ return(Matrix[Double] X)
{
-
# sanity checks
if( threshold < 0 | threshold > 1 )
stop("Stopping due to invalid input, threshold required in interval [0, 1] found "+threshold)
- if(sourceAttribute < 0 | sourceAttribute > ncol(F) | targetAttribute < 0 | targetAttribute > ncol(F))
+ if(sourceAttribute < 0 | sourceAttribute > ncol(X) | targetAttribute < 0 | targetAttribute > ncol(X))
stop("Stopping due to invalid source and target")
-
-
- # detect schema for transformation
- schema = detectSchema(F)
- s=""
- for(i in 1: ncol(F)) {
- if(as.scalar(schema[1,i]) == "STRING" | as.scalar(schema[1,i]) == "BOOLEAN" )
- s = s+as.integer(i)+",";
- }
-
- # recode data frame
- jspecR = "{ids:true, recode:["+s+"]}";
- [X, M] = transformencode(target=F, spec=jspecR);
-
+
# impute missing values and fix errors
X[,targetAttribute] = imputeAndCorrect(X[,sourceAttribute], X[,targetAttribute], threshold)
-
- # getting the actual data back
- dF = transformdecode(target=X, spec=jspecR, meta=M);
- imputed_F = dF;
}
imputeAndCorrect = function(Matrix[Double] X, Matrix[Double] Y, Double threshold)
diff --git a/src/test/scripts/functions/builtin/imputeFD.dml b/src/test/scripts/functions/builtin/imputeFD.dml
index 9325562..4782921 100644
--- a/src/test/scripts/functions/builtin/imputeFD.dml
+++ b/src/test/scripts/functions/builtin/imputeFD.dml
@@ -16,6 +16,24 @@
#
#-------------------------------------------------------------
-X = read($1, data_type="frame", format="csv", header=FALSE);
+F = read($1, data_type="frame", format="csv", header=FALSE);
+# imputeByFD accepts a matrix, so recode the non-numeric columns of the frame into a matrix first
+
+# detect schema for transformation
+schema = detectSchema(F)
+s=""
+for(i in 1: ncol(F)) {
+ if(as.scalar(schema[1,i]) == "STRING" | as.scalar(schema[1,i]) == "BOOLEAN" )
+ s = s+as.integer(i)+",";
+}
+
+# recode data frame
+jspecR = "{ids:true, recode:["+s+"]}";
+[X, M] = transformencode(target=F, spec=jspecR);
+# call the method
Y = imputeByFD(X, $2, $3, $4);
-write(Y, $5, format="binary")
+
+# decode the imputed matrix back into a frame holding the original (non-recoded) values
+dF = transformdecode(target=Y, spec=jspecR, meta=M);
+
+write(dF, $5, format="binary")