DL: Set plan_cache_mode when passing weights for GPDB6

JIRA: MADLIB-1414

Previously, on GPDB6 and later, when a query was invoked with the weights
value passed in as a param, the planner would build a custom plan with the
weights constant-folded into the plan itself. The plan size therefore grew
by the size of the weights, bloating it up to the 1GB limit when
dispatching the query plan to the segments and causing OOM failures for
large weights.

GPDB provides a GUC, `plan_cache_mode`, that controls whether a PREPAREd
statement executes with a generic plan (the passed-in params are not
constant-folded into the plan) or a custom plan (the passed-in params are
constant-folded). On GPDB6 this commit therefore sets the GUC to
`force_generic_plan` so that generic plans are used whenever weights are
passed in as params to queries.
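
In PL/Python terms, the pattern this change standardizes on looks roughly
like the sketch below (the table and variable names are illustrative, not
taken from this patch):

    import plpy

    serialized_weights = b'...'  # hypothetical large bytea payload
    model_arch = '{}'            # hypothetical JSON model architecture

    # Force a generic plan so the weights stay a bound parameter instead
    # of being constant-folded into (and bloating) the dispatched plan.
    plpy.execute("SET plan_cache_mode = force_generic_plan")
    plan = plpy.prepare(
        "INSERT INTO model_out SELECT $1, $2",  # hypothetical table
        ["bytea", "json"])
    plpy.execute(plan, [serialized_weights, model_arch])
    plpy.execute("RESET plan_cache_mode")
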
diff --git a/src/ports/postgres/modules/convex/mlp_igd.py_in b/src/ports/postgres/modules/convex/mlp_igd.py_in
index 888bed9..956ea4c 100644
--- a/src/ports/postgres/modules/convex/mlp_igd.py_in
+++ b/src/ports/postgres/modules/convex/mlp_igd.py_in
@@ -1051,8 +1051,8 @@
         else:
             intermediate_col = unique_string()
             if classes:
-                score_format = create_cols_from_array_sql_string(
-                    classes, intermediate_col, 'prob', 'double precision', False, 'MLP') 
+                score_format, _ = create_cols_from_array_sql_string(
+                    classes, intermediate_col, 'prob', 'double precision', False, 'MLP')
             else:
                 # Case when the training step did not have to one-hot encode
                 # the dependent var.
diff --git a/src/ports/postgres/modules/deep_learning/madlib_keras.py_in b/src/ports/postgres/modules/deep_learning/madlib_keras.py_in
index 596b0b2..d6fb857 100644
--- a/src/ports/postgres/modules/deep_learning/madlib_keras.py_in
+++ b/src/ports/postgres/modules/deep_learning/madlib_keras.py_in
@@ -313,11 +313,15 @@
                  [compile_params, fit_params, name,
                   description, metrics_elapsed_time, class_values])
 
-    create_output_table = plpy.prepare("""
-        CREATE TABLE {0} AS SELECT
-        $1 as model_weights,
-        $2 as {1}""".format(model, ModelArchSchema.MODEL_ARCH), ["bytea", "json"])
-    plpy.execute(create_output_table, [serialized_weights, model_arch])
+    plpy.execute("""
+        CREATE TABLE {0}
+        (model_weights bytea,
+        {1} json)""".format(model, ModelArchSchema.MODEL_ARCH))
+    insert_output_table = plpy.prepare("""
+        INSERT INTO {0} SELECT model_weights, {1}
+        FROM (VALUES($1, $2))t(model_weights, {1})
+        """.format(model, ModelArchSchema.MODEL_ARCH), ["bytea", "json"])
+    plpy.execute(insert_output_table, [serialized_weights, model_arch])
 
     #TODO add a unit test for this in a future PR
     reset_cuda_env(original_cuda_env)
diff --git a/src/ports/postgres/modules/deep_learning/madlib_keras.sql_in b/src/ports/postgres/modules/deep_learning/madlib_keras.sql_in
index 1d2d0ba..e4794a3 100644
--- a/src/ports/postgres/modules/deep_learning/madlib_keras.sql_in
+++ b/src/ports/postgres/modules/deep_learning/madlib_keras.sql_in
@@ -1643,8 +1643,10 @@
     description             VARCHAR
 ) RETURNS VOID AS $$
     PythonFunctionBodyOnly(`deep_learning', `madlib_keras')
+    from utilities.control import SetGUC
     with AOControl(False):
-        madlib_keras.fit(**globals())
+        with SetGUC("plan_cache_mode", "force_generic_plan"):
+            madlib_keras.fit(**globals())
 $$ LANGUAGE plpythonu VOLATILE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
 
@@ -1836,16 +1838,18 @@
     mst_key                 INTEGER
 ) RETURNS VOID AS $$
     PythonFunctionBodyOnly(`deep_learning', `madlib_keras_predict')
+    from utilities.control import SetGUC
     with AOControl(False):
-        madlib_keras_predict.Predict(schema_madlib,
-               model_table,
-               test_table,
-               id_col,
-               independent_varname,
-               output_table,
-               pred_type,
-               use_gpus,
-               mst_key)
+        with SetGUC("plan_cache_mode", "force_generic_plan"):
+            madlib_keras_predict.Predict(schema_madlib,
+                   model_table,
+                   test_table,
+                   id_col,
+                   independent_varname,
+                   output_table,
+                   pred_type,
+                   use_gpus,
+                   mst_key)
 $$ LANGUAGE plpythonu VOLATILE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
 
@@ -1917,8 +1921,10 @@
     normalizing_const       DOUBLE PRECISION
 ) RETURNS VOID AS $$
     PythonFunctionBodyOnly(`deep_learning', `madlib_keras_predict')
+    from utilities.control import SetGUC
     with AOControl(False):
-        madlib_keras_predict.PredictBYOM(**globals())
+        with SetGUC("plan_cache_mode", "force_generic_plan"):
+            madlib_keras_predict.PredictBYOM(**globals())
 $$ LANGUAGE plpythonu VOLATILE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
 
@@ -1986,7 +1992,11 @@
     use_gpus                BOOLEAN,
     mst_key                 INTEGER
 ) RETURNS VOID AS $$
-    PythonFunction(`deep_learning', `madlib_keras', `evaluate')
+    PythonFunctionBodyOnly(`deep_learning', `madlib_keras')
+    from utilities.control import SetGUC
+    with AOControl(False):
+        with SetGUC("plan_cache_mode", "force_generic_plan"):
+            madlib_keras.evaluate(**globals())
 $$ LANGUAGE plpythonu VOLATILE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
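
`SetGUC` comes from `utilities.control`; its implementation is not part of
this diff, but conceptually it is a context manager along these lines (a
sketch only, not the actual MADlib code, which presumably also no-ops on
versions where the GUC does not exist):

    import plpy

    class SetGUC(object):
        """Set a GUC for the duration of a with-block, then restore it."""
        def __init__(self, guc_name, guc_value):
            self.guc_name = guc_name
            self.guc_value = guc_value

        def __enter__(self):
            # SET inside a PL/Python function is transaction-scoped, but
            # an explicit RESET on exit keeps the override's scope obvious.
            plpy.execute("SET {0} = {1}".format(self.guc_name,
                                                self.guc_value))
            return self

        def __exit__(self, exc_type, exc_value, traceback):
            plpy.execute("RESET {0}".format(self.guc_name))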
 
diff --git a/src/ports/postgres/modules/deep_learning/madlib_keras_fit_multiple_model.sql_in b/src/ports/postgres/modules/deep_learning/madlib_keras_fit_multiple_model.sql_in
index 1b0e675..df79d86 100644
--- a/src/ports/postgres/modules/deep_learning/madlib_keras_fit_multiple_model.sql_in
+++ b/src/ports/postgres/modules/deep_learning/madlib_keras_fit_multiple_model.sql_in
@@ -1398,8 +1398,10 @@
     description             VARCHAR
 ) RETURNS VOID AS $$
     PythonFunctionBodyOnly(`deep_learning', `madlib_keras_fit_multiple_model')
+    from utilities.control import SetGUC
     with AOControl(False):
-        fit_obj = madlib_keras_fit_multiple_model.FitMultipleModel(**globals())
+        with SetGUC("plan_cache_mode", "force_generic_plan"):
+            fit_obj = madlib_keras_fit_multiple_model.FitMultipleModel(**globals())
 $$ LANGUAGE plpythonu VOLATILE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
 
diff --git a/src/ports/postgres/modules/deep_learning/madlib_keras_predict.py_in b/src/ports/postgres/modules/deep_learning/madlib_keras_predict.py_in
index 16ae7b1..412e63b 100644
--- a/src/ports/postgres/modules/deep_learning/madlib_keras_predict.py_in
+++ b/src/ports/postgres/modules/deep_learning/madlib_keras_predict.py_in
@@ -35,6 +35,7 @@
 from utilities.utilities import create_cols_from_array_sql_string
 from utilities.utilities import get_segments_per_host
 from utilities.utilities import unique_string
+from utilities.validate_args import get_expr_type
 from utilities.validate_args import input_tbl_valid
 
 from madlib_keras_wrapper import *
@@ -78,7 +79,7 @@
         intermediate_col = unique_string()
         class_values = strip_trailing_nulls_from_class_values(self.class_values)
 
-        prediction_select_clause = create_cols_from_array_sql_string(
+        prediction_select_clause, create_table_columns = create_cols_from_array_sql_string(
             class_values, intermediate_col, pred_col_name,
             pred_col_type, self.is_response, self.module_name)
         gp_segment_id_col, seg_ids_test, \
@@ -94,14 +95,20 @@
             group_by_clause = "GROUP BY {self.test_table}.gp_segment_id".format(self=self)
             join_cond_on_segmentid = "{self.test_table}.gp_segment_id=min_ctid.gp_segment_id AND".format(self=self)
 
+        # Calling CREATE TABLE instead of CTAS, to ensure that the plan_cache_mode
+        # guc codepath is called when passing in the weights
+        plpy.execute("""
+            CREATE TABLE {self.output_table}
+            ({self.id_col} {self.id_col_type}, {create_table_columns})
+            """.format(self=self, create_table_columns=create_table_columns))
         # Passing huge model weights to internal_keras_predict() for each row
         # resulted in slowness of overall madlib_keras_predict().
         # To avoid this, a CASE is added to pass the model weights only for
         # the very first row(min(ctid)) that is fetched on each segment and NULL
         # for the other rows.
         predict_query = plpy.prepare("""
-            CREATE TABLE {self.output_table} AS
-            SELECT {self.id_col}, {prediction_select_clause}
+            INSERT INTO {self.output_table}
+            SELECT {self.id_col}::{self.id_col_type}, {prediction_select_clause}
             FROM (
                 SELECT {self.test_table}.{self.id_col},
                        ({self.schema_madlib}.internal_keras_predict
@@ -175,6 +182,7 @@
         self.dependent_varname = param_proc.get_dependent_varname()
 
         self.validate()
+        self.id_col_type = get_expr_type(self.id_col, self.test_table)
         BasePredict.call_internal_keras(self)
         if self.is_mult_model:
             plpy.execute("DROP VIEW IF EXISTS {}".format(self.temp_summary_view))
@@ -230,6 +238,7 @@
             self.test_table, self.id_col, self.output_table,
             self.independent_varname)
         self.validate_and_set_defaults()
+        self.id_col_type = get_expr_type(self.id_col, self.test_table)
         BasePredict.call_internal_keras(self)
 
     def validate_and_set_defaults(self):
diff --git a/src/ports/postgres/modules/utilities/test/unit_tests/test_utilities.py_in b/src/ports/postgres/modules/utilities/test/unit_tests/test_utilities.py_in
index b884eec..0318516 100644
--- a/src/ports/postgres/modules/utilities/test/unit_tests/test_utilities.py_in
+++ b/src/ports/postgres/modules/utilities/test/unit_tests/test_utilities.py_in
@@ -254,15 +254,17 @@
         self.colname = 'estimated_col'
         self.coltype = 'dummy'
         self.has_one_ele = True
-        out_sql = utils.create_cols_from_array_sql_string(
+        out_sql, out_col = utils.create_cols_from_array_sql_string(
             self.py_list, self.sql_array_col, self.colname, self.coltype,
             self.has_one_ele, "dummy_module")
         self.assertEqual(out_sql, 'sqlcol[1]+1 AS estimated_col')
+        self.assertEqual(out_col, 'estimated_col dummy')
         self.has_one_ele = False
-        out_sql = utils.create_cols_from_array_sql_string(
+        out_sql, out_col = utils.create_cols_from_array_sql_string(
             self.py_list, self.sql_array_col, self.colname, self.coltype,
             self.has_one_ele, "dummy_module")
         self.assertEqual(out_sql, 'sqlcol AS estimated_col')
+        self.assertEqual(out_col, 'estimated_col dummy[]')
 
     def test_create_cols_from_array_sql_string_one_ele(self):
         utils = self.subject
@@ -271,10 +273,11 @@
         self.colname = 'estimated_pred'
         self.coltype = 'TEXT'
         self.has_one_ele = True
-        out_sql = utils.create_cols_from_array_sql_string(
+        out_sql, out_col = utils.create_cols_from_array_sql_string(
             self.py_list, self.sql_array_col, self.colname, self.coltype,
             self.has_one_ele, "dummy_module")
         self.assertTrue(out_sql, "(ARRAY['cat','dog'])[sqlcol[1]+1]::TEXT AS estimated_pred")
+        self.assertTrue(out_col, "estimated_pred TEXT")
 
     def test_create_cols_from_array_sql_string_one_ele_with_NULL(self):
         utils = self.subject
@@ -283,10 +286,11 @@
         self.colname = 'estimated_pred'
         self.coltype = 'INTEGER'
         self.has_one_ele = True
-        out_sql = utils.create_cols_from_array_sql_string(
+        out_sql, out_col = utils.create_cols_from_array_sql_string(
             self.py_list, self.sql_array_col, self.colname, self.coltype,
             self.has_one_ele, "dummy_module")
         self.assertEqual(out_sql, "(ARRAY[ NULL,1,2 ]::INTEGER[])[sqlcol[1]+1]::INTEGER AS estimated_pred")
+        self.assertEqual(out_col, "estimated_pred INTEGER")
 
     def test_create_cols_from_array_sql_string_one_ele_with_many_NULL(self):
         utils = self.subject
@@ -307,10 +311,11 @@
         self.colname = 'prob'
         self.coltype = 'TEXT'
         self.has_one_ele = False
-        out_sql = utils.create_cols_from_array_sql_string(
+        out_sql, out_col = utils.create_cols_from_array_sql_string(
             self.py_list, self.sql_array_col, self.colname, self.coltype,
             self.has_one_ele, "dummy_module")
         self.assertEqual(out_sql, "CAST(sqlcol[1] AS TEXT) AS \"prob_cat\", CAST(sqlcol[2] AS TEXT) AS \"prob_dog\"")
+        self.assertEqual(out_col, "\"prob_cat\" TEXT, \"prob_dog\" TEXT")
 
     def test_create_cols_from_array_sql_string_many_ele_with_NULL(self):
         utils = self.subject
@@ -319,10 +324,11 @@
         self.colname = 'prob'
         self.coltype = 'TEXT'
         self.has_one_ele = False
-        out_sql = utils.create_cols_from_array_sql_string(
+        out_sql, out_col = utils.create_cols_from_array_sql_string(
             self.py_list, self.sql_array_col, self.colname, self.coltype,
             self.has_one_ele, "dummy_module")
         self.assertEqual(out_sql, "CAST(sqlcol[1] AS TEXT) AS \"prob_NULL\", CAST(sqlcol[2] AS TEXT) AS \"prob_cat\", CAST(sqlcol[3] AS TEXT) AS \"prob_dog\"")
+        self.assertEqual(out_col, "\"prob_NULL\" TEXT, \"prob_cat\" TEXT, \"prob_dog\" TEXT")
 
     def test_create_cols_from_array_sql_string_many_ele_with_many_NULL(self):
         utils = self.subject
diff --git a/src/ports/postgres/modules/utilities/utilities.py_in b/src/ports/postgres/modules/utilities/utilities.py_in
index 12b5205..210fbd4 100644
--- a/src/ports/postgres/modules/utilities/utilities.py_in
+++ b/src/ports/postgres/modules/utilities/utilities.py_in
@@ -431,7 +431,8 @@
                                       coltype, has_one_ele,
                                       module_name='Input Error'):
     """
-    Create SQL string to convert array of elements into multiple columns.
+    Create SQL string to convert array of elements into multiple columns and corresponding
+    SQL string of columns for CREATE TABLE.
     @args:
         @param: py_list, python list, if None, return sql_array_col as colname.
                             The py_list can at most have one 'None' element that
@@ -454,6 +455,7 @@
                 coltype = TEXT
                 has_one_ele = FALSE
             Output:
+                prob_cat TEXT, prob_dog TEXT
                 CAST(sqlcol[1] AS TEXT) AS prob_cat, CAST(sqlcol[2] AS TEXT) AS prob_dog
         2) Input:
                 py_list = ['cat', 'dog']
@@ -462,23 +464,25 @@
                 coltype = TEXT
                 has_one_ele = TRUE
             Output:
+                estimated_pred TEXT
                 (ARRAY['cat','dog'])[sqlcol[1]+1]::TEXT AS estimated_pred
 
     @NOTE:
         If py_list is [None, 'cat', 'dog', NULL']:
         then the SQL query string returned would create the following
         column names:
-            prob_NULL, prob_cat, 'prob_dog', and 'prob_"NULL'.
+            prob_NULL, prob_cat, 'prob_dog', and 'prob_"NULL"'.
         1. Notice that for None, which represents Postgres' NULL value, the
         column name will be 'prob_NULL',
         2. and to differentiate the column name for a string 'NULL', the
-        resulting column name will be 'prob_"NULL'.
+        resulting column name will be 'prob_"NULL"'.
 
         The weird quoting in this column name is due to calling strip after
         quote_ident in the code below.
 
     @returns:
         @param, str, that can be used in a SQL query.
+        @param, str, that can be used in a SQL query.
 
     """
     _assert(sql_array_col, "{0}: sql_array_col should be a valid string.".
@@ -515,6 +519,8 @@
             py_list_sql_str = py_list_to_sql_string(py_list, coltype+'[]')
             select_clause = "({0})[{1}[1]+1]::{2} AS {3}".format(
                 py_list_sql_str, sql_array_col, coltype, colname)
+            create_columns = "{0} {1}".format(
+                colname, coltype)
         else:
             # Create as many columns as the length of py_list. The
             # colnames are created based on the elements in py_list,
@@ -534,12 +540,21 @@
                            coltype=coltype)
                 for i, suffix in enumerate(py_list)
                 ])
+            create_columns = ', '.join(
+                ['"{final_colname}" {coltype}'.
+                     format(final_colname=quote_ident("{0}_{1}".
+                                                      format(colname, str(suffix))).strip(' "'),
+                            coltype=coltype)
+                 for i, suffix in enumerate(py_list)
+                 ])
     else:
         if has_one_ele:
             select_clause = '{0}[1]+1 AS {1}'.format(sql_array_col, colname)
+            create_columns = '{0} {1}'.format(colname, coltype)
         else:
             select_clause = '{0} AS {1}'.format(sql_array_col, colname)
-    return select_clause
+            create_columns = '{0} {1}'.format(colname, coltype+'[]')
+    return select_clause, create_columns
 # ------------------------------------------------------------------------
 
 def _array_to_string(origin):
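
For reference, the new two-value contract in use (the expected strings are
taken from the unit tests above; the CREATE TABLE call and its table name
are an illustrative sketch):

    import plpy
    from utilities.utilities import create_cols_from_array_sql_string

    select_clause, create_columns = create_cols_from_array_sql_string(
        py_list=['cat', 'dog'], sql_array_col='sqlcol', colname='prob',
        coltype='TEXT', has_one_ele=False, module_name='dummy_module')
    # select_clause  == 'CAST(sqlcol[1] AS TEXT) AS "prob_cat", '
    #                   'CAST(sqlcol[2] AS TEXT) AS "prob_dog"'
    # create_columns == '"prob_cat" TEXT, "prob_dog" TEXT'
    plpy.execute('CREATE TABLE predictions (id INTEGER, {0})'.format(
        create_columns))  # hypothetical output table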