MLP: Fix bug in weights argument

JIRA: MADLIB-1471

When passing in the weights param as a column in the table, it would
fail with the error `column does not exist`. This column was missed when
calling the step function. If it were passed in as a constant value,
this would work fine. We fixed this by adding the weights column to the
normalized/scaled table.

Additionally, there was a bug in validating the
weights column type: it validated only integer and float types and
would fail for other numeric types. The intended column type was any
numeric type.
This commit fixes both these bugs and adds tests for
passing in weights as a table column.

Co-authored-by: Ekta Khanna <ekhanna@vmware.com>
diff --git a/src/ports/postgres/modules/convex/mlp_igd.py_in b/src/ports/postgres/modules/convex/mlp_igd.py_in
index 956ea4c..329a426 100644
--- a/src/ports/postgres/modules/convex/mlp_igd.py_in
+++ b/src/ports/postgres/modules/convex/mlp_igd.py_in
@@ -415,7 +415,8 @@
                                         schema_madlib=args["schema_madlib"],
                                         x_mean_table=args["x_mean_table"],
                                         y_mean_table='',
-                                        grouping_col=args["grouping_col"])
+                                        grouping_col=args["grouping_col"],
+                                        weights=args["weights"])
     else:
         # When no grouping_col is defined, the mean and std for 'x'
         # can be defined using strings, stored in x_mean_str, x_std_str.
@@ -440,7 +441,8 @@
                                x_std_str=x_std_str,
                                y_mean='',
                                y_std='',
-                               grouping_col=args["grouping_col"])
+                               grouping_col=args["grouping_col"],
+                               weights=args["weights"])
 
     return None
 # ------------------------------------------------------------------------
@@ -735,10 +737,8 @@
                 "MLP Error: The input weights param is not supported with"
                 " mini-batch version of MLP.")
     else:
-        int_types = ['integer', 'smallint', 'bigint']
-        float_types = ['double precision', 'real']
-        _assert(get_expr_type(weights, source_table) in int_types + float_types,
-                "MLP error: Weights should be a numeric type")
+        _assert(is_valid_psql_type(get_expr_type(weights, source_table), NUMERIC),
+                "MLP error: Weights should be a numeric type")
         _assert(array_col_has_same_dimension(source_table, independent_varname),
                 "Independent variable column should refer to arrays of the same length")
 
diff --git a/src/ports/postgres/modules/convex/test/mlp.sql_in b/src/ports/postgres/modules/convex/test/mlp.sql_in
index 9053df7..056179a 100644
--- a/src/ports/postgres/modules/convex/test/mlp.sql_in
+++ b/src/ports/postgres/modules/convex/test/mlp.sql_in
@@ -308,6 +308,63 @@
     'mlp_prediction_output',
     'output');
 
+-- Test for passing NUMERIC row_weights with grouping
+DROP TABLE IF EXISTS iris_data_row_weight;
+CREATE TABLE iris_data_row_weight
+  AS SELECT *, id::NUMERIC AS row_weight FROM iris_data;
+DROP TABLE IF EXISTS mlp_class, mlp_class_summary, mlp_class_standardization;
+SELECT mlp_classification(
+    'iris_data_row_weight',    -- Source table
+    'mlp_class',    -- Destination table
+    'attributes',   -- Input features
+    'class',        -- Label
+    ARRAY[5],   -- Number of units per layer
+    'learning_rate_init=0.1,
+    learning_rate_policy=constant,
+    n_iterations=5,
+    n_tries=3,
+    tolerance=0',
+    'sigmoid',
+    'row_weight',
+    False,
+    False,
+    'grp'
+);
+DROP TABLE IF EXISTS mlp_prediction_output;
+SELECT mlp_predict(
+    'mlp_class',
+    'iris_data',
+    'id',
+    'mlp_prediction_output',
+    'output');
+
+-- Test for passing NUMERIC row_weights without grouping
+DROP TABLE IF EXISTS iris_data_row_weight;
+CREATE TABLE iris_data_row_weight
+  AS SELECT *, id::NUMERIC AS row_weight FROM iris_data;
+DROP TABLE IF EXISTS mlp_class, mlp_class_summary, mlp_class_standardization;
+SELECT mlp_classification(
+    'iris_data_row_weight',    -- Source table
+    'mlp_class',    -- Destination table
+    'attributes',   -- Input features
+    'class',        -- Label
+    ARRAY[5],   -- Number of units per layer
+    'learning_rate_init=0.1,
+    learning_rate_policy=constant,
+    n_iterations=5,
+    n_tries=3,
+    tolerance=0',
+    'sigmoid',
+    'row_weight'
+);
+DROP TABLE IF EXISTS mlp_prediction_output;
+SELECT mlp_predict(
+    'mlp_class',
+    'iris_data',
+    'id',
+    'mlp_prediction_output',
+    'output');
+
 -- minibatch without grouping and without warm_start
 DROP TABLE IF EXISTS mlp_class_batch, mlp_class_batch_summary, mlp_class_batch_standardization;
 SELECT mlp_classification(
@@ -1006,6 +1063,66 @@
              )
 FROM mlp_prediction_regress  LIMIT 1;
 
+-- with weights with grouping without minibatch  without warm start
+CREATE TABLE lin_housing_wi_with_row_weight AS SELECT *, (id%3) +1 AS row_weight FROM lin_housing_wi;
+DROP TABLE IF EXISTS mlp_regress, mlp_regress_summary, mlp_regress_standardization;
+SELECT mlp_regression(
+               'lin_housing_wi_with_row_weight',           -- Source table
+               'mlp_regress',              -- Destination table
+               'x',                        -- Input features
+               'y',                        -- Dependent variable
+               ARRAY[40],                 -- Number of units per layer
+               'learning_rate_init=0.015,
+               learning_rate_policy=inv,
+               n_iterations=5, n_tries=3,
+               tolerance=0',
+               'sigmoid',
+               'row_weight',
+               False,
+               False,
+               'grp');
+DROP TABLE IF EXISTS mlp_prediction_regress;
+SELECT mlp_predict(
+               'mlp_regress',
+               'lin_housing_wi',
+               'id',
+               'mlp_prediction_regress',
+               'output');
+SELECT assert(
+                   __to_char(pg_typeof(estimated_y)) = 'double precision[]',
+                   'Estimated y should be an array. Actual ' || __to_char(pg_typeof(estimated_y))
+           )
+FROM mlp_prediction_regress  LIMIT 1;
+
+-- with weights without grouping without minibatch  without warm start
+DROP TABLE IF EXISTS mlp_regress, mlp_regress_summary, mlp_regress_standardization;
+SELECT mlp_regression(
+               'lin_housing_wi_with_row_weight',           -- Source table
+               'mlp_regress',              -- Destination table
+               'x',                        -- Input features
+               'y',                        -- Dependent variable
+               ARRAY[40],                 -- Number of units per layer
+               'learning_rate_init=0.015,
+               learning_rate_policy=inv,
+               n_iterations=5, n_tries=3,
+               tolerance=0',
+               'sigmoid',
+               'row_weight',
+               False,
+               False);
+DROP TABLE IF EXISTS mlp_prediction_regress;
+SELECT mlp_predict(
+               'mlp_regress',
+               'lin_housing_wi',
+               'id',
+               'mlp_prediction_regress',
+               'output');
+SELECT assert(
+                   __to_char(pg_typeof(estimated_y)) = 'double precision[]',
+                   'Estimated y should be an array. Actual ' || __to_char(pg_typeof(estimated_y))
+           )
+FROM mlp_prediction_regress  LIMIT 1;
+
 -- minibatch without grouping and without warm start
 DROP TABLE IF EXISTS mlp_regress_batch, mlp_regress_batch_summary, mlp_regress_batch_standardization;
 SELECT mlp_regression(
diff --git a/src/ports/postgres/modules/convex/utils_regularization.py_in b/src/ports/postgres/modules/convex/utils_regularization.py_in
index 6ed98bf..2b9a279 100644
--- a/src/ports/postgres/modules/convex/utils_regularization.py_in
+++ b/src/ports/postgres/modules/convex/utils_regularization.py_in
@@ -239,6 +239,7 @@
         y_mean_join_clause = "INNER JOIN {0} AS __y__ ON {1}".format(
             kwargs.get('y_mean_table'), group_where_y)
     ydecenter_str = "- __y__.mean".format(**kwargs) if y_decenter else ""
+    weights_str = ", {weights}".format(**kwargs) if 'weights' in kwargs else ""
     plpy.execute("""
             CREATE TEMP TABLE {tbl_data_scaled}
             m4_ifdef(`__POSTGRESQL__', `', `WITH (appendonly=true)')
@@ -250,6 +251,7 @@
                     AS {col_ind_var_norm_new},
                 ({col_dep_var} {ydecenter_str})  AS {col_dep_var_norm_new},
                 {select_grouping_cols}
+                {weights_str}
             FROM {tbl_data}
             {x_mean_join_clause}
             {y_mean_join_clause}
@@ -257,6 +259,7 @@
                    x_mean_join_clause=x_mean_join_clause,
                    y_mean_join_clause=y_mean_join_clause,
                    select_grouping_cols=select_grouping_cols,
+                   weights_str=weights_str,
                    **kwargs))
     return None
 # ========================================================================
@@ -286,6 +289,7 @@
                        to be compatible with array[...] expressions
     """
     ydecenter_str = "- {y_mean}".format(**kwargs) if y_decenter else ""
+    weights_str = ", {weights}".format(**kwargs) if 'weights' in kwargs else ""
     plpy.execute(
         """
             CREATE TEMP TABLE {tbl_data_scaled}
@@ -298,8 +302,11 @@
                         '{x_std_str}'::double precision[]))
                     AS {col_ind_var_norm_new},
                 ({col_dep_var} {ydecenter_str})  AS {col_dep_var_norm_new}
+                {weights_str}
             FROM {tbl_data}
-        """.format(ydecenter_str=ydecenter_str, **kwargs))
+        """.format(ydecenter_str=ydecenter_str,
+                   weights_str=weights_str,
+                   **kwargs))
 
     return None
 # ========================================================================