DL: Improve the predict output with prob (#514) * DL: Improve the predict output with prob JIRA: MADLIB-1451 The prob output for predict created a bunch of columns for different class values. This commit pivots this format so the table signature is the same but the number of rows is increased thanks to a separate row for each class value. In addition, this commit adds the option to filter the output by a min probability or top n results as well as a ranking column for non-response options.

commit: 38e399f2fa749e8b8d544a1de6e56666a410490c [log] [tgz]
author: Orhan Kislal <okislal@apache.org> Thu Aug 27 22:16:25 2020 -0400
committer: GitHub <noreply@github.com> Thu Aug 27 22:16:25 2020 -0400
tree: 89b8dfe0f640a06ea85b7205e7722787eb376b88
parent: f6fdef8acbed20c4fa47410f6f9bdc85340f40e0 [diff]
diff --git a/src/ports/postgres/modules/deep_learning/madlib_keras.sql_in b/src/ports/postgres/modules/deep_learning/madlib_keras.sql_in
index f4f03cf..2a26481 100644
--- a/src/ports/postgres/modules/deep_learning/madlib_keras.sql_in
+++ b/src/ports/postgres/modules/deep_learning/madlib_keras.sql_in

@@ -1869,9 +1869,9 @@
     id_col                  VARCHAR,
     independent_varname     VARCHAR,
     output_table            VARCHAR,
-    pred_type               VARCHAR,
-    use_gpus                BOOLEAN,
-    mst_key                 INTEGER
+    pred_type               VARCHAR DEFAULT 'prob',
+    use_gpus                BOOLEAN DEFAULT FALSE,
+    mst_key                 INTEGER DEFAULT NULL
 ) RETURNS VOID AS $$
     PythonFunctionBodyOnly(`deep_learning', `madlib_keras_predict')
     from utilities.control import SetGUC
@@ -1895,12 +1895,25 @@
     id_col                  VARCHAR,
     independent_varname     VARCHAR,
     output_table            VARCHAR,
-    pred_type               VARCHAR,
-    use_gpus                BOOLEAN
+    pred_type               INTEGER,
+    use_gpus                BOOLEAN DEFAULT FALSE,
+    mst_key                 INTEGER DEFAULT NULL
 ) RETURNS VOID AS $$
-    SELECT MADLIB_SCHEMA.madlib_keras_predict($1, $2, $3, $4, $5, $6, $7, NULL);
-$$ LANGUAGE sql VOLATILE
-m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA');
+    PythonFunctionBodyOnly(`deep_learning', `madlib_keras_predict')
+    from utilities.control import SetGUC
+    with AOControl(False):
+        with SetGUC("plan_cache_mode", "force_generic_plan"):
+            madlib_keras_predict.Predict(schema_madlib,
+                   model_table,
+                   test_table,
+                   id_col,
+                   independent_varname,
+                   output_table,
+                   pred_type,
+                   use_gpus,
+                   mst_key)
+$$ LANGUAGE plpythonu VOLATILE
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
 
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.madlib_keras_predict(
     model_table             VARCHAR,
@@ -1908,28 +1921,30 @@
     id_col                  VARCHAR,
     independent_varname     VARCHAR,
     output_table            VARCHAR,
-    pred_type               VARCHAR
+    pred_type               DOUBLE PRECISION,
+    use_gpus                BOOLEAN DEFAULT FALSE,
+    mst_key                 INTEGER DEFAULT NULL
 ) RETURNS VOID AS $$
-    SELECT MADLIB_SCHEMA.madlib_keras_predict($1, $2, $3, $4, $5, $6, FALSE, NULL);
-$$ LANGUAGE sql VOLATILE
-m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA');
-
-CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.madlib_keras_predict(
-    model_table             VARCHAR,
-    test_table              VARCHAR,
-    id_col                  VARCHAR,
-    independent_varname     VARCHAR,
-    output_table            VARCHAR
-) RETURNS VOID AS $$
-    SELECT MADLIB_SCHEMA.madlib_keras_predict($1, $2, $3, $4, $5, NULL, FALSE, NULL);
-$$ LANGUAGE sql VOLATILE
-m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA');
+    PythonFunctionBodyOnly(`deep_learning', `madlib_keras_predict')
+    from utilities.control import SetGUC
+    with AOControl(False):
+        with SetGUC("plan_cache_mode", "force_generic_plan"):
+            madlib_keras_predict.Predict(schema_madlib,
+                   model_table,
+                   test_table,
+                   id_col,
+                   independent_varname,
+                   output_table,
+                   pred_type,
+                   use_gpus,
+                   mst_key)
+$$ LANGUAGE plpythonu VOLATILE
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
 
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.internal_keras_predict(
     independent_var    REAL[],
     model_architecture TEXT,
     model_weights      BYTEA,
-    is_response        BOOLEAN,
     normalizing_const  DOUBLE PRECISION,
     current_seg_id     INTEGER,
     seg_ids            INTEGER[],
@@ -1951,10 +1966,10 @@
     id_col                  VARCHAR,
     independent_varname     VARCHAR,
     output_table            VARCHAR,
-    pred_type               VARCHAR,
-    use_gpus                BOOLEAN,
-    class_values            TEXT[],
-    normalizing_const       DOUBLE PRECISION
+    pred_type               VARCHAR DEFAULT 'prob',
+    use_gpus                BOOLEAN DEFAULT NULL,
+    class_values            TEXT[] DEFAULT NULL,
+    normalizing_const       DOUBLE PRECISION DEFAULT NULL
 ) RETURNS VOID AS $$
     PythonFunctionBodyOnly(`deep_learning', `madlib_keras_predict')
     from utilities.control import SetGUC
@@ -1971,15 +1986,19 @@
     id_col                  VARCHAR,
     independent_varname     VARCHAR,
     output_table            VARCHAR,
-    pred_type               VARCHAR,
-    use_gpus                BOOLEAN,
-    class_values            TEXT[]
+    pred_type               INTEGER,
+    use_gpus                BOOLEAN DEFAULT NULL,
+    class_values            TEXT[] DEFAULT NULL,
+    normalizing_const       DOUBLE PRECISION DEFAULT NULL
 ) RETURNS VOID AS $$
-    SELECT MADLIB_SCHEMA.madlib_keras_predict_byom($1, $2, $3, $4, $5, $6, $7, $8, $9, NULL);
-$$ LANGUAGE sql VOLATILE
+    PythonFunctionBodyOnly(`deep_learning', `madlib_keras_predict')
+    from utilities.control import SetGUC
+    with AOControl(False):
+        with SetGUC("plan_cache_mode", "force_generic_plan"):
+            madlib_keras_predict.PredictBYOM(**globals())
+$$ LANGUAGE plpythonu VOLATILE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
 
-
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.madlib_keras_predict_byom(
     model_arch_table        VARCHAR,
     model_id                INTEGER,
@@ -1987,39 +2006,18 @@
     id_col                  VARCHAR,
     independent_varname     VARCHAR,
     output_table            VARCHAR,
-    pred_type               VARCHAR,
-    use_gpus                BOOLEAN
+    pred_type               DOUBLE PRECISION,
+    use_gpus                BOOLEAN DEFAULT NULL,
+    class_values            TEXT[] DEFAULT NULL,
+    normalizing_const       DOUBLE PRECISION DEFAULT NULL
 ) RETURNS VOID AS $$
-    SELECT MADLIB_SCHEMA.madlib_keras_predict_byom($1, $2, $3, $4, $5, $6, $7, $8, NULL, NULL);
-$$ LANGUAGE sql VOLATILE
+    PythonFunctionBodyOnly(`deep_learning', `madlib_keras_predict')
+    from utilities.control import SetGUC
+    with AOControl(False):
+        with SetGUC("plan_cache_mode", "force_generic_plan"):
+            madlib_keras_predict.PredictBYOM(**globals())
+$$ LANGUAGE plpythonu VOLATILE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
-
-
-CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.madlib_keras_predict_byom(
-    model_arch_table        VARCHAR,
-    model_id                INTEGER,
-    test_table              VARCHAR,
-    id_col                  VARCHAR,
-    independent_varname     VARCHAR,
-    output_table            VARCHAR,
-    pred_type               VARCHAR
-) RETURNS VOID AS $$
-    SELECT MADLIB_SCHEMA.madlib_keras_predict_byom($1, $2, $3, $4, $5, $6, $7, NULL, NULL, NULL);
-$$ LANGUAGE sql VOLATILE
-m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
-
-CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.madlib_keras_predict_byom(
-    model_arch_table        VARCHAR,
-    model_id                INTEGER,
-    test_table              VARCHAR,
-    id_col                  VARCHAR,
-    independent_varname     VARCHAR,
-    output_table            VARCHAR
-) RETURNS VOID AS $$
-    SELECT MADLIB_SCHEMA.madlib_keras_predict_byom($1, $2, $3, $4, $5, $6, NULL, NULL, NULL, NULL);
-$$ LANGUAGE sql VOLATILE
-m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
-
 -------------------------------------------------------------------------------
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.madlib_keras_evaluate(
     model_table             VARCHAR,
@@ -2170,7 +2168,6 @@
 $$ LANGUAGE sql IMMUTABLE
 m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `CONTAINS SQL', `');
 
-
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.madlib_keras_predict(
      message VARCHAR
 ) RETURNS VARCHAR AS $$
@@ -2186,3 +2183,17 @@
 $$ LANGUAGE sql IMMUTABLE
 m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `CONTAINS SQL', `');
 
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.madlib_keras_predict_byom(
+     message VARCHAR
+) RETURNS VARCHAR AS $$
+    PythonFunctionBodyOnly(`deep_learning', `madlib_keras_predict')
+    with AOControl(False):
+        return madlib_keras_predict.predict_byom_help(**globals())
+$$ LANGUAGE plpythonu IMMUTABLE
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `');
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.madlib_keras_predict_byom()
+RETURNS VARCHAR AS $$
+    SELECT MADLIB_SCHEMA.madlib_keras_predict_byom('');
+$$ LANGUAGE sql IMMUTABLE
+m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `CONTAINS SQL', `');

diff --git a/src/ports/postgres/modules/deep_learning/madlib_keras_predict.py_in b/src/ports/postgres/modules/deep_learning/madlib_keras_predict.py_in
index 412e63b..0d542b2 100644
--- a/src/ports/postgres/modules/deep_learning/madlib_keras_predict.py_in
+++ b/src/ports/postgres/modules/deep_learning/madlib_keras_predict.py_in

@@ -32,9 +32,9 @@
 from utilities.control import MinWarning
 from utilities.utilities import _assert
 from utilities.utilities import add_postfix
-from utilities.utilities import create_cols_from_array_sql_string
 from utilities.utilities import get_segments_per_host
 from utilities.utilities import unique_string
+from utilities.utilities import get_psql_type
 from utilities.validate_args import get_expr_type
 from utilities.validate_args import input_tbl_valid
 
@@ -65,28 +65,46 @@
         self._set_default_pred_type()
 
     def _set_default_pred_type(self):
-        self.pred_type =  'response' if not self.pred_type else self.pred_type
+        self.pred_type = 'prob' if self.pred_type is None else self.pred_type
         self.is_response = True if self.pred_type == 'response' else False
+        self.pred_type = 1 if self.is_response else self.pred_type
+        self.get_all = True if self.pred_type == 'prob' else False
+        self.use_ratio = True if self.pred_type < 1 else False
 
     def call_internal_keras(self):
-        if self.is_response:
-            pred_col_name = add_postfix("estimated_", self.dependent_varname)
-            pred_col_type = self.dependent_vartype
-        else:
-            pred_col_name = "prob"
-            pred_col_type = 'double precision'
 
-        intermediate_col = unique_string()
+        pred_col_name = 'prob'
+        pred_col_type = 'double precision'
+
         class_values = strip_trailing_nulls_from_class_values(self.class_values)
-
-        prediction_select_clause, create_table_columns = create_cols_from_array_sql_string(
-            class_values, intermediate_col, pred_col_name,
-            pred_col_type, self.is_response, self.module_name)
         gp_segment_id_col, seg_ids_test, \
         images_per_seg_test = get_image_count_per_seg_for_non_minibatched_data_from_db(
             self.test_table)
         segments_per_host = get_segments_per_host()
 
+        if self.pred_type == 1:
+            rank_create_sql = ""
+
+        self.pred_vartype = self.dependent_vartype.strip('[]')
+        unnest_sql = ''
+        if self.pred_vartype in ['text', 'character varying', 'varchar']:
+
+            unnest_sql = "unnest(ARRAY{0}) AS {1} , unnest".format(
+                ['NULL' if i is None else i for i in class_values],
+                self.dependent_varname)
+        else:
+
+            unnest_sql = "unnest(ARRAY[{0}]) AS {1} , unnest".format(
+                ','.join(['NULL' if i is None else str(i) for i in class_values]),
+                self.dependent_varname)
+
+        if self.get_all:
+            filter_sql = ""
+        elif self.use_ratio:
+            filter_sql = "WHERE {pred_col_name} > {self.pred_type}".format(**locals())
+        else:
+            filter_sql = "WHERE rank <= {self.pred_type}".format(**locals())
+
         select_segmentid_comma = ""
         group_by_clause = ""
         join_cond_on_segmentid = ""
@@ -99,54 +117,59 @@
         # guc codepath is called when passing in the weights
         plpy.execute("""
             CREATE TABLE {self.output_table}
-            ({self.id_col} {self.id_col_type}, {create_table_columns})
-            """.format(self=self, create_table_columns=create_table_columns))
+                ({self.id_col} {self.id_col_type},
+                 {self.dependent_varname} {self.pred_vartype},
+                 {pred_col_name} {pred_col_type},
+                 rank INTEGER)
+            """.format(**locals()))
         # Passing huge model weights to internal_keras_predict() for each row
         # resulted in slowness of overall madlib_keras_predict().
         # To avoid this, a CASE is added to pass the model weights only for
         # the very first row(min(ctid)) that is fetched on each segment and NULL
         # for the other rows.
+
         predict_query = plpy.prepare("""
             INSERT INTO {self.output_table}
-            SELECT {self.id_col}::{self.id_col_type}, {prediction_select_clause}
+            SELECT *
             FROM (
-                SELECT {self.test_table}.{self.id_col},
-                       ({self.schema_madlib}.internal_keras_predict
-                           ({self.independent_varname},
-                            $1,
-                            CASE WHEN {self.test_table}.ctid = min_ctid.ctid THEN $2 ELSE NULL END,
-                            {self.is_response},
-                            {self.normalizing_const},
-                            {gp_segment_id_col},
-                            ARRAY{seg_ids_test},
-                            ARRAY{images_per_seg_test},
-                            {self.use_gpus},
-                            {self.gpus_per_host},
-                            {segments_per_host})
-                       ) AS {intermediate_col}
-                FROM {self.test_table}
-                LEFT JOIN
-                    (SELECT {select_segmentid_comma} MIN({self.test_table}.ctid) AS ctid
-                     FROM {self.test_table}
-                     {group_by_clause}) min_ctid
-                ON {join_cond_on_segmentid} {self.test_table}.ctid=min_ctid.ctid
-            ) q
-            """.format(self=self, prediction_select_clause=prediction_select_clause,
-                       seg_ids_test=seg_ids_test,
-                       images_per_seg_test=images_per_seg_test,
-                       gp_segment_id_col=gp_segment_id_col,
-                       segments_per_host=segments_per_host,
-                       intermediate_col=intermediate_col,
-                       select_segmentid_comma=select_segmentid_comma,
-                       group_by_clause=group_by_clause,
-                       join_cond_on_segmentid=join_cond_on_segmentid),
-                                     ["text", "bytea"])
+                SELECT *, row_number() OVER (PARTITION BY {self.id_col}
+                                  ORDER BY {pred_col_name} DESC) AS rank
+                FROM (
+                    SELECT  {self.id_col}::{self.id_col_type},
+                            {unnest_sql}({self.schema_madlib}.internal_keras_predict
+                                ({self.independent_varname},
+                                $1,
+                                CASE WHEN {self.test_table}.ctid = min_ctid.ctid THEN $2 ELSE NULL END,
+                                {self.normalizing_const},
+                                {gp_segment_id_col},
+                                ARRAY{seg_ids_test},
+                                ARRAY{images_per_seg_test},
+                                {self.use_gpus},
+                                {self.gpus_per_host},
+                                {segments_per_host})
+                            ) AS {pred_col_name}
+                        FROM {self.test_table}
+                        LEFT JOIN
+                            (SELECT {select_segmentid_comma} MIN({self.test_table}.ctid) AS ctid
+                             FROM {self.test_table}
+                             {group_by_clause}) min_ctid
+                        ON {join_cond_on_segmentid} {self.test_table}.ctid=min_ctid.ctid
+                ) __subq1__
+            ) __subq2__
+            {filter_sql}
+            """.format(**locals()), ["text", "bytea"])
         plpy.execute(predict_query, [self.model_arch, self.model_weights])
 
+        if self.is_response:
+            # Drop the rank column since it is irrelevant
+            plpy.execute("""
+                ALTER TABLE {self.output_table}
+                DROP COLUMN rank
+                """.format(**locals()))
+
+
     def set_default_class_values(self, class_values):
         self.class_values = class_values
-        if self.pred_type == 'prob':
-            return
         if self.class_values is None:
             num_classes = get_num_classes(self.model_arch)
             self.class_values = range(0, num_classes)
@@ -224,10 +247,15 @@
         BasePredict.__init__(self, schema_madlib, model_arch_table,
                              test_table, id_col, independent_varname,
                              output_table, pred_type, use_gpus, self.module_name)
-        if self.is_response:
-            self.dependent_vartype = 'text'
+
+        if self.class_values:
+            self.dependent_vartype = get_psql_type(self.class_values[0])
         else:
-            self.dependent_vartype = 'double precision'
+            if self.pred_type == 1:
+                self.dependent_vartype = 'text'
+            else:
+                self.dependent_vartype = 'double precision'
+
         ## Set default values for norm const and class_values
         # use_gpus and pred_type are defaulted in base_predict's init
         self.normalizing_const = normalizing_const
@@ -262,7 +290,7 @@
             get_input_shape(self.model_arch), 1)
 
 def internal_keras_predict(independent_var, model_architecture, model_weights,
-                           is_response, normalizing_const, current_seg_id, seg_ids,
+                           normalizing_const, current_seg_id, seg_ids,
                            images_per_seg, use_gpus, gpus_per_host, segments_per_host,
                            **kwargs):
     SD = kwargs['SD']
@@ -286,22 +314,13 @@
         independent_var = expand_input_dims(independent_var)
         independent_var /= normalizing_const
 
-        if is_response:
-            with K.tf.device(device_name):
-                y_prob = model.predict(independent_var)
-                proba_argmax = y_prob.argmax(axis=-1)
-            # proba_argmax is a list with exactly one element in it. That element
-            # refers to the index containing the largest probability value in the
-            # output of Keras' predict function.
-            result = proba_argmax
-        else:
-            with K.tf.device(device_name):
-                probs = model.predict(independent_var)
-            # probs is a list containing a list of probability values, of all
-            # class levels. Since we are assuming each input is a single image,
-            # and not mini-batched, this list contains exactly one list in it,
-            # so return back the first list in probs.
-            result = probs[0]
+        with K.tf.device(device_name):
+            probs = model.predict(independent_var)
+        # probs is a list containing a list of probability values, of all
+        # class levels. Since we are assuming each input is a single image,
+        # and not mini-batched, this list contains exactly one list in it,
+        # so return back the first list in probs.
+        result = probs[0]
         total_images = get_image_count_per_seg_from_array(seg_ids.index(current_seg_id),
                                                           images_per_seg)
 
@@ -364,15 +383,11 @@
 -----------------------------------------------------------------------
 The output table ('output_table' above) contains the following columns:
 
-id:                 Gives the 'id' for each prediction, corresponding
-                    to each row from the test_table.
-estimated_COL_NAME: (For pred_type='response') The estimated class for
-                    classification, where COL_NAME is the name of the
-                    column to be predicted from test data.
-prob_CLASS:         (For pred_type='prob' for classification) The
-                    probability of a given class. There will be one
-                    column for each class in the training data.
-                    TODO change this
+id:                     Gives the 'id' for each prediction,
+                        corresponding to each row from the test_table.
+dependent_varname:      The estimated class.
+prob:                   The probability of a given class.
+rank:                   The rank of the estimation.
 """
     else:
         help_string = "No such option. Use {schema_madlib}.madlib_keras_predict()"
@@ -436,18 +451,11 @@
 -----------------------------------------------------------------------
 The output table ('output_table' above) contains the following columns:
 
-id:                 Gives the 'id' for each prediction, corresponding
-                    to each row from the test_table.
-estimated_dependent_var: (For pred_type='response') The estimated class for
-                    classification. If class_values is passed in as NULL, then we
-                    assume that the class labels are [0,1,2...,n] where n in the
-                    num of classes in the model architecture.
-prob_CLASS:         (For pred_type='prob' for classification) The
-                    probability of a given class.
-                    If class_values is passed in as NULL, we create just one column
-                    called 'prob' which is an array of probabilites of all the classes
-                    Otherwise if class_values is not NULL, then there will be one
-                    column for each class in the training data.
+id:                     Gives the 'id' for each prediction,
+                        corresponding to each row from the test_table.
+dependent_varname:      The estimated class.
+prob:                   The probability of a given class.
+rank:                   The rank of the estimation.
 """
     else:
         help_string = "No such option. Use {schema_madlib}.madlib_keras_predict_byom()"

diff --git a/src/ports/postgres/modules/deep_learning/madlib_keras_validator.py_in b/src/ports/postgres/modules/deep_learning/madlib_keras_validator.py_in
index 8ced08b..0fac1bf 100644
--- a/src/ports/postgres/modules/deep_learning/madlib_keras_validator.py_in
+++ b/src/ports/postgres/modules/deep_learning/madlib_keras_validator.py_in

@@ -85,9 +85,21 @@
 
     @staticmethod
     def validate_pred_type(module_name, pred_type, class_values):
-        if not pred_type in ['prob', 'response']:
-            plpy.error("{0}: Invalid value for pred_type param ({1}). Must be "\
-                "either response or prob.".format(module_name, pred_type))
+
+        error = False
+        if type(pred_type) == str:
+            if not pred_type in ['prob', 'response']:
+                error = True
+        elif type(pred_type) == int:
+            if pred_type <= 0:
+                error = True
+        else:
+            if pred_type < 0.0 or pred_type >= 1.0:
+                error = True
+        if error:
+            plpy.error("{0}: Invalid value for pred_type param ({1}). "\
+                "Must be integer>0, double precision in the range [0.0,1.0), "\
+                "'response' or 'prob'.".format(module_name, pred_type))
 
     @staticmethod
     def validate_input_shape(table, independent_varname, input_shape, offset,

diff --git a/src/ports/postgres/modules/deep_learning/test/madlib_keras_predict.sql_in b/src/ports/postgres/modules/deep_learning/test/madlib_keras_predict.sql_in
index 5a5b362..3d9d0d9 100644
--- a/src/ports/postgres/modules/deep_learning/test/madlib_keras_predict.sql_in
+++ b/src/ports/postgres/modules/deep_learning/test/madlib_keras_predict.sql_in

@@ -52,16 +52,16 @@
 SELECT assert(UPPER(pg_typeof(id)::TEXT) = 'INTEGER', 'id column should be INTEGER type')
     FROM cifar10_predict;
 
-SELECT assert(UPPER(pg_typeof(estimated_y)::TEXT) =
+SELECT assert(UPPER(pg_typeof(y)::TEXT) =
     'SMALLINT', 'prediction column should be SMALLINT type')
     FROM cifar10_predict;
 
 -- Validate correct number of rows returned.
-SELECT assert(COUNT(*)=2, 'Output table of madlib_keras_predict should have two rows')
+SELECT assert(COUNT(*)=4, 'Output table of madlib_keras_predict should have two rows')
 FROM cifar10_predict;
 
 -- First test that all values are in set of class values; if this breaks, it's definitely a problem.
-SELECT assert(estimated_y IN (0,1),
+SELECT assert(y IN (0,1),
     'Predicted value not in set of defined class values for model')
 FROM cifar10_predict;
 
@@ -76,7 +76,7 @@
     FALSE);$TRAP$) = 1,
     'Passing batched image table to predict should error out.');
 
--- Test with pred_type=prob
+-- Test with pred_type=0.2
 DROP TABLE IF EXISTS cifar10_predict;
 SELECT madlib_keras_predict(
     'keras_saved_out',
@@ -84,18 +84,33 @@
     'id',
     'x',
     'cifar10_predict',
-    'prob',
+    0.2,
     FALSE);
 
-SELECT assert(UPPER(pg_typeof(prob_0)::TEXT) =
-    'DOUBLE PRECISION', 'column prob_0 should be double precision type')
+SELECT assert(UPPER(pg_typeof(prob)::TEXT) =
+    'DOUBLE PRECISION', 'column prob should be double precision type')
     FROM  cifar10_predict;
 
-SELECT assert(UPPER(pg_typeof(prob_1)::TEXT) =
-    'DOUBLE PRECISION', 'column prob_1 should be double precision type')
+SELECT assert(COUNT(*)=4, 'Predict out table must have exactly three cols.')
+FROM pg_attribute
+WHERE attrelid='cifar10_predict'::regclass AND attnum>0;
+
+-- Test with pred_type=2
+DROP TABLE IF EXISTS cifar10_predict;
+SELECT madlib_keras_predict(
+    'keras_saved_out',
+    'cifar_10_sample',
+    'id',
+    'x',
+    'cifar10_predict',
+    2,
+    FALSE);
+
+SELECT assert(UPPER(pg_typeof(prob)::TEXT) =
+    'DOUBLE PRECISION', 'column prob should be double precision type')
     FROM  cifar10_predict;
 
-SELECT assert(COUNT(*)=3, 'Predict out table must have exactly three cols.')
+SELECT assert(COUNT(*)=4, 'Predict out table must have exactly three cols.')
 FROM pg_attribute
 WHERE attrelid='cifar10_predict'::regclass AND attnum>0;
 
@@ -139,7 +154,7 @@
     'y_text',
     'x',
     'text',
-    ARRAY[NULL,'cat','dog',NULL,NULL],
+    ARRAY[NULL,'cat','dog','bird','fish'],
     1,
     255.0);
 
@@ -165,25 +180,16 @@
     'id',
     'x',
     'cifar10_predict',
-    'prob',
+    0.2,
     FALSE);
 
 -- Validate the output datatype of newly created prediction columns
 -- for prediction type = 'prob' and class_values 'TEXT' with NULL as a valid
 -- class_values
-SELECT assert(UPPER(pg_typeof(prob_cat)::TEXT) =
-    'DOUBLE PRECISION', 'column prob_cat should be double precision type')
+SELECT assert(UPPER(pg_typeof(prob)::TEXT) =
+    'DOUBLE PRECISION', 'column prob should be double precision type')
 FROM cifar10_predict;
 
-SELECT assert(UPPER(pg_typeof(prob_dog)::TEXT) =
-    'DOUBLE PRECISION', 'column prob_dog should be double precision type')
-FROM cifar10_predict;
-
-SELECT assert(UPPER(pg_typeof("prob_NULL")::TEXT) =
-    'DOUBLE PRECISION', 'column prob_NULL should be double precision type')
-FROM cifar10_predict;
-
--- Must have exactly 4 cols (3 for class_values and 1 for id)
 SELECT assert(COUNT(*)=4, 'Predict out table must have exactly four cols.')
 FROM pg_attribute
 WHERE attrelid='cifar10_predict'::regclass AND attnum>0;
@@ -202,7 +208,7 @@
 -- Validate the output datatype of newly created prediction columns
 -- for prediction type = 'response' and class_values 'TEXT' with NULL
 -- as a valid class_values
-SELECT assert(UPPER(pg_typeof(estimated_y_text)::TEXT) =
+SELECT assert(UPPER(pg_typeof(y_text)::TEXT) =
     'TEXT', 'prediction column should be TEXT type')
 FROM  cifar10_predict LIMIT 1;
 
@@ -210,7 +216,7 @@
 -- in input summary table will be NULL.
 UPDATE keras_saved_out_summary SET class_values=NULL;
 
--- Predict with pred_type=prob
+-- Predict with pred_type=all
 DROP TABLE IF EXISTS cifar10_predict;
 SELECT madlib_keras_predict(
     'keras_saved_out',
@@ -221,12 +227,8 @@
     'prob',
     FALSE);
 
--- Validate the output datatype of newly created prediction column
--- for prediction type = 'response' and class_value = NULL
--- Returns: Array of probabilities for user's one-hot encoded data
-SELECT assert(UPPER(pg_typeof(prob)::TEXT) =
-    'DOUBLE PRECISION[]', 'column prob should be double precision[] type')
-FROM cifar10_predict LIMIT 1;
+SELECT assert(count(0) = 5, 'y should get 5 values because dependent_var = [0,0,1,0,0]')
+FROM cifar10_predict WHERE id = 0;
 
 -- Predict with pred_type=response
 DROP TABLE IF EXISTS cifar10_predict;
@@ -243,8 +245,8 @@
 -- for prediction type = 'response' and class_value = NULL
 -- Returns: Index of class value in user's one-hot encoded data with
 -- highest probability
-SELECT assert(UPPER(pg_typeof(estimated_y_text)::TEXT) =
-    'TEXT', 'column estimated_y_text should be text type')
+SELECT assert(UPPER(pg_typeof(y_text)::TEXT) =
+    'TEXT', 'column y_text should be text type')
 FROM cifar10_predict LIMIT 1;
 
 -- Test predict with INTEGER class_values
@@ -266,17 +268,8 @@
     'prob',
     FALSE);
 
--- Validate the output datatype of newly created prediction column
--- for prediction type = 'prob' and class_values 'INT' with NULL
--- as a valid class_values
-SELECT assert(UPPER(pg_typeof("prob_NULL")::TEXT) =
-    'DOUBLE PRECISION', 'column prob_NULL should be double precision type')
-FROM cifar10_predict;
-
--- Must have exactly 6 cols (5 for class_values and 1 for id)
-SELECT assert(COUNT(*)=6, 'Predict out table must have exactly six cols.')
-FROM pg_attribute
-WHERE attrelid='cifar10_predict'::regclass AND attnum>0;
+SELECT assert(count(*)=5, 'Predict out table must have 5 different y values')
+FROM cifar10_predict WHERE id = 0;
 
 -- Predict with pred_type=response
 DROP TABLE IF EXISTS cifar10_predict;
@@ -293,9 +286,9 @@
 -- for prediction type = 'response' and class_values 'TEXT' with NULL
 -- as a valid class_values
 -- Returns: class_value with highest probability
-SELECT assert(UPPER(pg_typeof(estimated_y)::TEXT) =
-    'SMALLINT', 'prediction column should be smallint type')
-FROM cifar10_predict;
+
+SELECT assert(count(*)=1, 'Predict out table must have a single response')
+FROM cifar10_predict WHERE id = 0;
 
 -- Predict with correctly shaped data, must go thru.
 -- Update output_summary table to reflect
@@ -395,7 +388,7 @@
     'Predict output validation failed.')
 FROM iris_multiple_model_info i,
 (SELECT count(*)/(150*0.8) AS test_accuracy FROM
-    (SELECT iris_train.class_text AS actual, iris_predict.estimated_class_text AS estimated
+    (SELECT iris_train.class_text AS actual, iris_predict.class_text AS estimated
      FROM iris_predict INNER JOIN iris_train
      ON iris_train.id=iris_predict.id)q
      WHERE q.actual=q.estimated) q2

diff --git a/src/ports/postgres/modules/deep_learning/test/madlib_keras_predict_byom.sql_in b/src/ports/postgres/modules/deep_learning/test/madlib_keras_predict_byom.sql_in
index 12dee6f..bd17bec 100644
--- a/src/ports/postgres/modules/deep_learning/test/madlib_keras_predict_byom.sql_in
+++ b/src/ports/postgres/modules/deep_learning/test/madlib_keras_predict_byom.sql_in

@@ -38,7 +38,8 @@
                             'iris_test',  -- test_table
                             'id',  -- id column
                             'attributes', -- independent var
-                            'iris_predict'  -- output table
+                            'iris_predict',  -- output table
+                            'response'
                             );
 
 -- Copy weights that were learnt from the previous run, for transfer
@@ -61,14 +62,10 @@
                                  );
 
 SELECT assert(
-  p0.estimated_class_text = p1.estimated_dependent_var,
+  p0.class_text = p1.dependent_var,
   'Predict byom failure for non null class value and response pred_type.')
 FROM iris_predict AS p0,  iris_predict_byom AS p1
 WHERE p0.id=p1.id;
-SELECT assert(UPPER(pg_typeof(estimated_dependent_var)::TEXT) = 'TEXT',
-	'Predict byom failure for non null class value and response pred_type.
-	 Expeceted estimated_dependent_var to be of type TEXT')
-FROM  iris_predict_byom LIMIT 1;
 
 -- class_values NULL, pred_type is NULL (response)
 DROP TABLE IF EXISTS iris_predict_byom;
@@ -81,13 +78,9 @@
                                  'iris_predict_byom'
                                  );
 SELECT assert(
-  p1.estimated_dependent_var IN ('0', '1', '2'),
+  p1.dependent_var IN ('0', '1', '2'),
   'Predict byom failure for null class value and null pred_type.')
 FROM iris_predict_byom AS p1;
-SELECT assert(UPPER(pg_typeof(estimated_dependent_var)::TEXT) = 'TEXT',
-	'Predict byom failure for non null class value and response pred_type.
-	 Expeceted estimated_dependent_var to be of type TEXT')
-FROM  iris_predict_byom LIMIT 1;
 
 -- class_values not NULL, pred_type is prob
 DROP TABLE IF EXISTS iris_predict_byom;
@@ -106,13 +99,9 @@
                                  );
 
 SELECT assert(
-  (p1."prob_Iris-setosa" + p1."prob_Iris-virginica" + p1."prob_Iris-versicolor") - 1 < 1e-6,
+  sum(prob) - 1 < 1e-6,
     'Predict byom failure for non null class value and prob pred_type.')
-FROM iris_predict_byom AS p1;
-SELECT assert(UPPER(pg_typeof("prob_Iris-setosa")::TEXT) = 'DOUBLE PRECISION',
-	'Predict byom failure for non null class value and prob pred_type.
-	Expeceted "prob_Iris-setosa" to be of type DOUBLE PRECISION')
-FROM  iris_predict_byom LIMIT 1;
+FROM iris_predict_byom AS p1 GROUP BY id;
 
 -- class_values NULL, pred_type is prob
 DROP TABLE IF EXISTS iris_predict_byom;
@@ -128,10 +117,6 @@
                                  NULL
                                  );
 SELECT assert(
-  (prob[1] + prob[2] + prob[3]) - 1 < 1e-6,
+  sum(prob) - 1 < 1e-6,
     'Predict byom failure for null class value and prob pred_type.')
-FROM iris_predict_byom;
-SELECT assert(UPPER(pg_typeof(prob)::TEXT) = 'DOUBLE PRECISION[]',
-	'Predict byom failure for null class value and prob pred_type. Expeceted prob to
-	be of type DOUBLE PRECISION[]')
-FROM  iris_predict_byom LIMIT 1;
+FROM iris_predict_byom GROUP BY id;

diff --git a/src/ports/postgres/modules/deep_learning/test/unit_tests/test_madlib_keras.py_in b/src/ports/postgres/modules/deep_learning/test/unit_tests/test_madlib_keras.py_in
index d3b2cd7..774c943 100644
--- a/src/ports/postgres/modules/deep_learning/test/unit_tests/test_madlib_keras.py_in
+++ b/src/ports/postgres/modules/deep_learning/test/unit_tests/test_madlib_keras.py_in

@@ -522,9 +522,9 @@
         is_response = True
         result = self.subject.internal_keras_predict(
             self.independent_var, self.model.to_json(),
-            serialized_weights, is_response, 255, 0, self.all_seg_ids,
+            serialized_weights, 255, 0, self.all_seg_ids,
             self.total_images_per_seg, False, 0, 4, **k)
-        self.assertEqual(1, len(result))
+        self.assertEqual(2, len(result))
         self.assertEqual(1,  k['SD']['row_count'])
         self.assertEqual(True, 'segment_model_predict' in k['SD'])
 
@@ -535,9 +535,9 @@
         k['SD']['segment_model_predict'] = self.model
         is_response = True
         result = self.subject.internal_keras_predict(
-            self.independent_var, None, None, is_response, 255, 0,
+            self.independent_var, None, None, 255, 0,
             self.all_seg_ids, self.total_images_per_seg, False, 0, 4, **k)
-        self.assertEqual(1, len(result))
+        self.assertEqual(2, len(result))
         self.assertEqual(2,  k['SD']['row_count'])
         self.assertEqual(True, 'segment_model_predict' in k['SD'])
 
@@ -549,9 +549,9 @@
         k['SD']['segment_model_predict'] = self.model
         is_response = True
         result = self.subject.internal_keras_predict(
-            self.independent_var, None, None, is_response, 255, 0,
+            self.independent_var, None, None, 255, 0,
             self.all_seg_ids, self.total_images_per_seg, False, 0, 4, **k)
-        self.assertEqual(1, len(result))
+        self.assertEqual(3, len(result))
         self.assertEqual(False, 'row_count' in k['SD'])
         self.assertEqual(False, 'segment_model_predict' in k['SD'])
 
@@ -559,7 +559,7 @@
         k['SD']['segment_model_predict'] = self.model
         is_response = False
         result = self.subject.internal_keras_predict(
-            self.independent_var, None, None, is_response, 255, 0,
+            self.independent_var, None, None, 255, 0,
             self.all_seg_ids, self.total_images_per_seg, False, 0, 4, **k)
 
         # we except len(result) to be 3 because we have 3 dense layers in the
@@ -580,7 +580,7 @@
         is_response = True
         with self.assertRaises(plpy.PLPYException):
             self.subject.internal_keras_predict(
-                self.independent_var, None, None, is_response, normalizing_const,
+                self.independent_var, None, None, normalizing_const,
                 0, self.all_seg_ids, self.total_images_per_seg, False, 0, 4, **k)
         self.assertEqual(False, 'row_count' in k['SD'])
         self.assertEqual(False, 'segment_model_predict' in k['SD'])
@@ -626,11 +626,10 @@
                                  'model_id', 'test_table', 'id_col',
                                  'independent_varname', 'output_table', None,
                                  True, None, None)
-        self.assertEqual('response', res.pred_type)
+        self.assertEqual('prob', res.pred_type)
         self.assertEqual(2, res.gpus_per_host)
         self.assertEqual([0,1,2,3,4], res.class_values)
         self.assertEqual(1.0, res.normalizing_const)
-        self.assertEqual('text', res.dependent_vartype)
 
     def test_predictbyom_defaults_2(self):
         res = self.module.PredictBYOM('schema_madlib', 'model_arch_table',
@@ -642,7 +641,6 @@
         self.assertEqual(0, res.gpus_per_host)
         self.assertEqual(['foo', 'bar', 'baaz', 'foo2', 'bar2'], res.class_values)
         self.assertEqual(255.0, res.normalizing_const)
-        self.assertEqual('double precision', res.dependent_vartype)
 
     def test_predictbyom_exception_invalid_params(self):
         with self.assertRaises(plpy.PLPYException) as error:

diff --git a/src/ports/postgres/modules/utilities/utilities.py_in b/src/ports/postgres/modules/utilities/utilities.py_in
index 577235e..946b139 100644
--- a/src/ports/postgres/modules/utilities/utilities.py_in
+++ b/src/ports/postgres/modules/utilities/utilities.py_in

@@ -1286,3 +1286,15 @@
 def is_platform_gp6_or_up():
     version_wrapper = __mad_version()
     return not is_platform_pg() and not version_wrapper.is_gp_version_less_than('6.0')
+
+def get_psql_type(py_type):
+    if type(py_type) == int:
+        return 'integer'
+    elif type(py_type) == float:
+        return 'double precision'
+    elif type(py_type) == bool:
+        return 'boolean'
+    elif type(py_type) == str:
+        return 'varchar'
+    else:
+        plpy.error("Cannot determine the type of {0}".format(py_type))
commit	38e399f2fa749e8b8d544a1de6e56666a410490c	[log] [tgz]
author	Orhan Kislal <okislal@apache.org>	Thu Aug 27 22:16:25 2020 -0400
committer	GitHub <noreply@github.com>	Thu Aug 27 22:16:25 2020 -0400
tree	89b8dfe0f640a06ea85b7205e7722787eb376b88
parent	f6fdef8acbed20c4fa47410f6f9bdc85340f40e0 [diff]