src/ports/postgres/modules/elastic_net/elastic_net.py_in - madlib - Git at Google


 import plpy
 from elastic_net_models import _elastic_net_gaussian_igd_train
 from elastic_net_models import _elastic_net_gaussian_fista_train
 from elastic_net_models import _elastic_net_binomial_fista_train
 from elastic_net_models import _elastic_net_binomial_igd_train
 from elastic_net_utils import _generate_warmup_lambda_sequence

 from elastic_net_utils import BINOMIAL_FAMILIES, GAUSSIAN_FAMILIES, OPTIMIZERS

 from utilities.validate_args import is_col_array
 from utilities.utilities import is_string_formatted_as_array_expression
 from utilities.validate_args import table_exists
 from utilities.validate_args import table_is_empty
 from utilities.validate_args import columns_exist_in_table
 from utilities.validate_args import get_cols_and_types
 from utilities.utilities import get_grouping_col_str
 from utilities.validate_args import cols_in_tbl_valid
 from utilities.validate_args import explicit_bool_to_text
 from utilities.utilities import extract_keyvalue_params

 from utilities.control import MinWarning

 from utilities.utilities import _string_to_array_with_quotes
 from utilities.utilities import _string_to_array
 from utilities.utilities import is_psql_numeric_type
 from utilities.utilities import _assert
 from utilities.utilities import add_postfix
 import re

 from validation.internal.cross_validation import CrossValidator
 # ------------------------------------------------------------------------


 def elastic_net_help(schema_madlib, family_or_optimizer=None, **kwargs):
     """
     Given a response family name or optimizer name, print out the related
     information.

     If a family name is given, print out the supported optimizer together
     with its default optimizer.

     If an optimizer name is given, print out the necessary parameters.
     """
     if (family_or_optimizer is None or
             family_or_optimizer.lower() in ("help", "?")):
         return """
     ----------------------------------------------------------------
                             Summary
     ----------------------------------------------------------------
     Right now, gaussian (linear) and binomial (logistic) families
     are supported!
     --
     Run:
     SELECT {schema_madlib}.elastic_net_train('gaussian');
     or
     SELECT {schema_madlib}.elastic_net_train('binomial');
     to see more help.
     --
     Run:  SELECT {schema_madlib}.elastic_net_train('usage');
     to see how to use.
     --
     Run:  SELECT {schema_madlib}.elastic_net_train('predict');
     to see how to predict.
         """.format(schema_madlib=schema_madlib)

     if (family_or_optimizer.lower() in ('usage', 'help', '?')):
         return """
     ----------------------------------------------------------------
                             USAGE
     ----------------------------------------------------------------
     SELECT {schema_madlib}.elastic_net_train (
         'tbl_source',      -- Data table
         'tbl_result',      -- Result table
         'col_dep_var',     -- Dependent variable, can be an expression or
                                 '*'
         'col_ind_var',     -- Independent variable, can be an expression
         'regress_family',  -- 'gaussian' (or 'linear'). 'binomial'
                                 (or 'logistic')
         alpha,             -- Elastic net controlparameter, value in [0, 1]
         lambda_value,      -- Regularization parameter, positive
         standardize,       --Whether to normalize the ata
         'grouping_col',    -- Group by which columns. (DEFAULT: NULL)
         'optimizer',       --Name of optimizer. (DEFAUT: 'fista')
         'optimizer_params',-- Comma-separated string of optimizer parameters
         'excluded',        -- Column names excluded frm '*' (DEFAULT = NULL)
         max_iter,          -- Maximum iteration numbr (DEFAULT = 1000)
         tolerance          -- Stopping criteria (DEFAULT = 1e-4)
     );
     ----------------------------------------------------------------
                             OUTPUT
     ----------------------------------------------------------------
     The output table ('tbl_result' above) has the following columns:
     grouping_col      TEXT,               --'Distinct values of groupng_col'
     family            TEXT,               --'gaussian' or 'binomial'
     features          TEXT[],             -- All feature column names
     features_selected TEXT[],             -- Features with non-zero coefficients
     coef_nonzero      DOUBLE PRECISION[], -- Non-zero coefficients
     coef_all          DOUBLE PRECISION[], -- All coefficients
     intercept         DOUBLE PRECISION,   -- Intercept of the linear fit
     log_likelihood    DOUBLE PRECISION,   -- log-likelihood of the fit
     standardize       BOOLEAN,            -- Whether the data was standardized
                                           -- before fitting
     iteration_run     INTEGER             -- How many iteration was actually run

     If the independent variable is a column with type of array, features
     and features_selected will output indices of the array.
         """.format(schema_madlib=schema_madlib)

     if family_or_optimizer.lower() == "predict":
         return """
     ----------------------------------------------------------------
                             Prediction
     ----------------------------------------------------------------
     SELECT {schema_madlib}.elastic_net_predict(
         'regress_family', -- 'gaussian' (or 'linear'). 'binomial'
                           --  (or 'logistic') will be supported
         coefficients,     -- Fitting coefficients as a double array
         intercept,
         ind_var           -- independent variables
     ) FROM tbl_result, tbl_new_source;

     When predicting with binomial models, the return value is 1
     if the predicted result is True, and 0 if the prediction is
     False.

     OR -------------------------------------------------------------

     (1) SELECT {schema_madlib}.elastic_net_gaussian_predict (
             coefficients, intercept, ind_var
         ) FROM tbl_result, tbl_new_source;

     (2) SELECT {schema_madlib}.elastic_net_binomial_predict (
             coefficients, intercept, ind_var
         ) FROM tbl_result, tbl_new_source;


     (3) SELECT {schema_madlib}.elastic_net_binomial_prob (
             coefficients, intercept, ind_var
         ) FROM tbl_result, tbl_new_source;

         This returns probability values for the class being 'True'.

     OR -------------------------------------------------------------

     SELECT {schema_madlib}.elastic_net_predict(
         'tbl_model',      -- Result table of elastic_net_train
         'tbl_new_source', -- New data source
         'col_id',         -- Unique ID column
        'tbl_predict'      -- Prediction result
     );
     will put all prediction results into a table. This can be
     used together with cross_validation_general() function.

     When predicting with binomial models, the predicted values
     are BOOLEAN.
         """.format(schema_madlib=schema_madlib)

     if (family_or_optimizer.lower() in ("gaussian", "linear")):
         return """
     ----------------------------------------------------------------
     Fitting linear models
     ----------------------------------------------------------------
     Supported optimizer:
     (1) Incremental gradient descent method ('igd')
     (2) Fast iterative shrinkage thesholding algorithm ('fista')

     Default is 'fista'
     --
     Run:
     SELECT {schema_madlib}.elastic_net_train('optimizer');
     to see more help on each optimizer.
         """.format(schema_madlib=schema_madlib)

     if (family_or_optimizer.lower() in ("binomial", "logistic")):
         return """
     ----------------------------------------------------------------
     Fitting logistic models
     ----------------------------------------------------------------
     The dependent variable must be a BOOLEAN.

     Supported optimizer:
     (1) Incremental gradient descent method ('igd')
     (2) Fast iterative shrinkage thesholding algorithm ('fista')

     Default is 'fista'
     --
     Run:
     SELECT {schema_madlib}.elastic_net_train('optimizer');
     to see more help on each optimizer.
         """.format(schema_madlib=schema_madlib)

     if family_or_optimizer.lower() == "igd":
         return """
     ----------------------------------------------------------------
     Incremental gradient descent (IGD) method
     ----------------------------------------------------------------
     Right now, it supports fitting both linear and logistic models.

     In order to obtain sparse coefficients, a
     modified version of IGD is actually used.

     Parameters --------------------------------
     stepsize          - default is 0.01
     threshold         - default is 1e-10. When a coefficient is really
                         small, set it to be 0
     warmup            - default is False
     warmup_lambdas    - default is Null
     warmup_lambda_no  - default is 15. How many lambda's are used in
                         warm-up, will be overridden if warmup_lambdas
                         is not NULL
     warmup_tolerance  - default is the same as tolerance. The value
                         of tolerance used during warmup.
     n_folds           - default is 1, Number of cross validation folds.
                         Set this to greater than 1 if CV over lambda is required.
     validation_result - Name of the table to store the cross validation results.
     parallel          - default is True. Run the computation on
                         multiple segments or not.

     When warmup is True or if n_folds > 1, and warmup_lambdas is NULL, a series
     of lambda values will be automatically generated and used.

     Reference --------------------------------
     [1] Shai Shalev-Shwartz and Ambuj Tewari, Stochastic Methods for l1
         Regularized Loss Minimization. Proceedings of the 26th Interna-
         tional Conference on Machine Learning, Montreal, Canada, 2009.
         """

     if family_or_optimizer.lower() == "fista":
         return """
     ----------------------------------------------------------------
     Fast iterative shrinkage thesholding algorithm
     with backtracking for stepsizes
     ----------------------------------------------------------------
     Right now, it supports fitting both linear and logistic models.

     Parameters --------------------------------
     max_stepsize        - default is 4.0
     eta                 - default is 1.2, if stepsize does not work
                           stepsize/eta will be tried
     warmup              - default is False
     warmup_lambdas      - default is NULL, which means that lambda
                           values will be automatically generated
     warmup_lambda_no    - default is 15. How many lambda's are used in
                           warm-up, will be overridden if warmup_lambdas
                           is not NULL
     warmup_tolerance    - default is the same as tolerance. The value
                           of tolerance used during warmup.
     use_active_set      - default is False. Sometimes active-set method
                           can speed up the calculation.
     activeset_tolerance - default is the same as tolerance. The
                           value of tolerance used during active set
                           calculation
     random_stepsize     - default is False. Whether add some randomness
                          to the step size. Sometimes, this can speed
                          up the calculation.
     n_folds             - default is 1. Number of cross validation folds.
                           Set this to greater than 1 if CV over lambda is required.
     validation_result   - Name of the table to store the cross validation results.

     When warmup is True and warmup_lambdas is NULL, warmup_lambda_no
     of lambda values will be automatically generated and used.

     Reference --------------------------------
     [1] Beck, A. and M. Teboulle (2009), A fast iterative
         shrinkage-thresholding algorothm for linear inverse
         problems. SIAM J. on Imaging Sciences 2(1), 183-202.
         """

     # if family_or_optimizer.lower() == "newton":
     #     return "Newton method  "

     return """
     Elastic Net error: Not a supported response family or optimizer
     Run:
     SELECT {schema_madlib}.elastic_net_train();
     for help
     """.format(schema_madlib=schema_madlib)
 # ------------------------------------------------------------------------


 def elastic_net_train(schema_madlib, source_table, model_table, dependent_varname,
                       independent_varname, regress_family, alpha, lambda_value,
                       standardize, grouping_col, optimizer,
                       optimizer_params, excluded, max_iter, tolerance,
                       **kwargs):
     """
     A wrapper for all variants of elastic net regularization.

     @param source_table        Name of data source table
     @param independent_varname       Name of independent variable column,
                              independent variable is an array
     @param dependent_varname       Name of dependent variable column
     @param model_table        Name of the table to store the results,
                              will return fitting coefficients and
                              likelihood
     @param lambda_value      The regularization parameter
     @param alpha             The elastic net parameter, [0, 1]
     @param standardize       Whether to normalize the variables
     @param regress_family    Response type, 'gaussian' or 'binomial'
     @param optimizer         The optimization algorithm, for example 'igd'
     @param optimizer_params  Parameters of the above optimizer, the format
                              is '{arg = value, ...}'::varchar[]
     @param excluded          Which variables are excluded when
                              independent_varname == "*"
     """
     with MinWarning("warning"):
         if regress_family is None:
             plpy.error("""
                        Elastic Net error: Please enter a valid response family name!
                        Run:
                        SELECT {schema_madlib}.elastic_net_train();
                        for supported response family.
                        """.format(schema_madlib=schema_madlib))

         if optimizer is None:
             plpy.error("""
                        Elastic Net error: Please enter a valid optimizer name!
                        Run:
                        SELECT {schema_madlib}.elastic_net_train('gaussian');
                        for supported optimizers.
                        """.format(schema_madlib=schema_madlib))

         regress_family = regress_family.lower()
         optimizer = optimizer.lower()

         if (regress_family not in (BINOMIAL_FAMILIES + GAUSSIAN_FAMILIES) or
                 optimizer not in OPTIMIZERS):
             plpy.error("""
                 Elastic Net error: Not a supported response family or supported
                 optimizer of the given response family!

                 Run:
                     SELECT {schema_madlib}.elastic_net_train();
                 for help.
                 """.format(schema_madlib=schema_madlib))

         cv_param, optimizer_params = _get_cv_optimizer_params(
             optimizer_params, alpha, lambda_value)

         args = locals()
         if cv_param['n_folds'] > 1:
             args.update(cv_param)
             _cross_validate_en(args)
         _internal_elastic_net_train(**args)
 # ------------------------------------------------------------------------


 def _internal_elastic_net_train(
         schema_madlib, source_table, model_table, dependent_varname,
         independent_varname, grouping_col,
         regress_family, alpha, lambda_value,
         standardize, optimizer, optimizer_params, excluded,
         max_iter, tolerance, **kwargs):

     tbl_summary = add_postfix(model_table, "_summary")

     # handle all special cases of independent_varname
     independent_varname, outstr_array = analyze_input_str(
         schema_madlib, source_table,
         independent_varname, dependent_varname, excluded)

     # get the grouping info
     reserved_cols =['regress_family', 'coef_all',
                      'features_selected',
                      'coef_nonzero', 'intercept',
                      'log_likelihood', 'standardize',
                      'iteration_run']
     grouping_str, grouping_col = get_grouping_col_str(schema_madlib, "Elastic Net",
                                                        reserved_cols,
                                                        source_table, grouping_col)

     if regress_family in GAUSSIAN_FAMILIES:
         if optimizer == OPTIMIZERS.igd:
             _elastic_net_gaussian_igd_train(
                 schema_madlib, source_table, independent_varname, dependent_varname,
                 model_table, tbl_summary, lambda_value, alpha, standardize,
                 optimizer_params, max_iter, tolerance, outstr_array,
                 grouping_str, grouping_col, **kwargs)
             return None
         if optimizer == OPTIMIZERS.fista:
             _elastic_net_gaussian_fista_train(
                 schema_madlib, source_table, independent_varname, dependent_varname,
                 model_table, tbl_summary, lambda_value, alpha, standardize,
                 optimizer_params, max_iter, tolerance, outstr_array,
                 grouping_str, grouping_col, **kwargs)
             return None
     elif regress_family in BINOMIAL_FAMILIES:
         if optimizer == OPTIMIZERS.igd:
             dependent_varname = "(" + dependent_varname + ")::boolean"
             _elastic_net_binomial_igd_train(
                 schema_madlib, source_table, independent_varname, dependent_varname,
                 model_table, tbl_summary, lambda_value, alpha, standardize,
                 optimizer_params, max_iter, tolerance, outstr_array,
                 grouping_str, grouping_col, **kwargs)
             return None
         if optimizer == OPTIMIZERS.fista:
             dependent_varname = "(" + dependent_varname + ")::boolean"
             _elastic_net_binomial_fista_train(
                 schema_madlib, source_table, independent_varname, dependent_varname,
                 model_table, tbl_summary, lambda_value, alpha, standardize,
                 optimizer_params, max_iter, tolerance, outstr_array,
                 grouping_str, grouping_col, **kwargs)
             return None
     return None
 # ----------------------------------------------------------------------


 def _get_cv_optimizer_params(param_str, alpha, smallest_lambda):
     cv_params_defaults = {
         "n_folds": (1, int),
         "lambda_value": (None, list),
         "alpha": ([alpha], list),
         "n_lambdas": (15, int),
         "validation_result": (None, str)
     }
     param_defaults = dict([(k, v[0]) for k, v in cv_params_defaults.items()])
     param_types = dict([(k, v[1]) for k, v in cv_params_defaults.items()])

     if not param_str:
         return param_defaults, param_str

     name_value = extract_keyvalue_params(param_str, param_types, param_defaults,
                                          ignore_invalid=True)
     if name_value['n_folds'] > 1:
         if not name_value['lambda_value']:
             if name_value['n_lambdas']:
                 name_value['lambda_value'] = _generate_warmup_lambda_sequence(
                     smallest_lambda, name_value['n_lambdas'])
                 # no warmup when cross validating on lambda
                 param_str += ', warmup=False'
             else:
                 name_value['lambda_value'] = [float(smallest_lambda)]
         else:
             name_value['lambda_value'] = map(float, name_value['lambda_value'])
             # no warmup when cross validating on lambda
             param_str += ', warmup=False'
         name_value['alpha'] = map(float, name_value['alpha'])
     return name_value, param_str
 # ------------------------------------------------------------------------


 def _cross_validate_en(args):
     # updating params_dict will also update args['params_dict']
     if args['n_folds'] > 1 and args['grouping_col']:
         plpy.error('Elastic Net Error: cross validation with grouping is not supported!')

     allowed_cv_params = ('lambda_value', 'alpha')  # keep trailing comma for single element
     cv_params_values = {}

     for param in allowed_cv_params:
         if isinstance(args[param], list):
             if len(args[param]) > 1:
                 cv_params_values[param] = args[param]
             else:
                 args[param] = args[param][0]

     if not cv_params_values and args['n_folds'] <= 1:
         # no cross validation
         return

     if not cv_params_values and args['n_folds'] > 1:
         plpy.warning('Elastic Net Warning: n_folds > 1 but no '
                      'cross validation parameter provided')
         return

     if cv_params_values and args['n_folds'] <= 1:
         plpy.error('Elastic Net Error: All parameters must be scalar '
                    'when n_folds is 0 or 1')

     scorer = 'classification' if args['regress_family'] in BINOMIAL_FAMILIES else 'regression'
     cv = CrossValidator(_internal_elastic_net_train, elastic_net_predict_all, scorer, args)
     val_res = cv.validate(cv_params_values, args['n_folds'])
     if 'validation_result' in args:
         val_res.output_tbl(args['validation_result'])
     args.update(val_res.top('sub_args'))
 # ------------------------------------------------------------------------------


 def _check_args(tbl_source, col_ind_var, col_dep_var):
     """
     Check arguments before analyze_input_str
     """
     if any(each_arg is None for each_arg in (tbl_source, col_ind_var, col_dep_var)):
         plpy.error("Elastic Net error: You have unsupported NULL value(s) in the arguments!")
     if any(each_arg.strip() == '' for each_arg in (tbl_source, col_ind_var, col_dep_var)):
         plpy.error("Elastic Net error: You have unsupported EMPTY value(s) in the arguments!")

     if not table_exists(tbl_source):
         plpy.error("Elastic Net error: Data table " + tbl_source + " does not exist!")

     if table_is_empty(tbl_source):
         plpy.error("Elastic Net error: Data table " + tbl_source + " is empty!")
 # ------------------------------------------------------------------------


 def analyze_input_str(schema_madlib, tbl_source,
                       col_ind_var, col_dep_var, excluded):
     """
     Make input strings and output strings compatible with functions

     @param tbl_source Data table
     @param col_ind_var Independent variables
     @param col_dep_var Dependent variables
     @param excluded Which variables are excluded when col_ind_var == "*"
     """
     _check_args(tbl_source, col_ind_var, col_dep_var)

     outstr_array = []
     if col_ind_var == "*":
         col_types_dict = dict(get_cols_and_types(tbl_source))
         cols = col_types_dict.keys()

         s = _string_to_array(excluded) if excluded is not None else []

         for each_col in cols:
             if each_col not in s and each_col != col_dep_var:
                 outstr_array.append(each_col)

         if not outstr_array:
             plpy.error("Elastic Net error: All columns from independent variables "
                        "have been excluded")
         elif (len(outstr_array) == 1 and
                 col_types_dict[outstr_array[0]].lower() == 'array'):
             col_ind_var = outstr_array[0]
             return analyze_single_input_str(schema_madlib, tbl_source,
                                             col_ind_var)
         else:
             included_col_types = [col_types_dict[i] for i in outstr_array]
             if not all(is_psql_numeric_type(i)
                        for i in included_col_types):
                 plpy.error("Elastic Net error: All columns to be included in the "
                            "independent variables should be of the numeric type.")
         col_ind_var_new = "ARRAY[" + ','.join(outstr_array) + "]"
         return (col_ind_var_new, outstr_array)

     if columns_exist_in_table(tbl_source, [col_ind_var], schema_madlib):
         # if the input is a column name and not an expression
         return analyze_single_input_str(schema_madlib, tbl_source,
                                         col_ind_var, excluded)
     else:
         # if input is an expression resulting in an array output
         matched = is_string_formatted_as_array_expression(col_ind_var)
         if matched:
             # array expression starts with the word "ARRAY"
             outstr_array = _string_to_array(matched.group(1))
         else:
             # any other form of array expression
             n_feat = plpy.execute(""" SELECT array_upper({indep_var}, 1) as num_feat
                                       FROM {source} LIMIT 1
                                   """.format(indep_var=col_ind_var,
                                              source=tbl_source))[0]["num_feat"]
             outstr_array = ["[" + str(i) + "]" for i in range(1, n_feat + 1)]
         # We allow expressions for independent variables that could start with
         # something other than 'array'
         # Example use case: input independent variable of array column
         # adding an intercept could be done as '1 || x' where 'x' is array
         # of independent variables.
         # plpy.error("Elastic Net error: Independent variable format is not quite right!")
         return (col_ind_var, outstr_array)
 # ------------------------------------------------------------------------


 def analyze_single_input_str(schema_madlib, tbl_source, col_ind_var,
                              excluded=None):
     """
     Args:
         @param schema_madlib: string, Name of schema where MADlib is installed
         @param tbl_source: string, Name of input table
         @param col_ind_var: string, Name of independent variable
                             (must be single column name and of ARRAY type )
         @param excluded: list, Indices of elements to exclude.

     Returns:

     """
     if columns_exist_in_table(tbl_source, [col_ind_var], schema_madlib):
         # a single column is independent variable
         # which means that it is an array
         # excluded must be a string containing integers
         if not is_col_array(tbl_source, col_ind_var):
             plpy.error("Elastic Net error: The independent variable must be an array!")

         dimension = plpy.execute(
             """
             SELECT array_upper({col_ind_var}, 1) AS dimension
             FROM {tbl_source} limit 1
             """.format(tbl_source=tbl_source,
                        col_ind_var=col_ind_var))[0]["dimension"]

         if excluded is not None:
             s = _string_to_array(excluded)
             invalid_excluded = """
                                Elastic Net error: When the independent variable is
                                an array column, excluded values can only be indices
                                (i.e. between 1 and {0})""".format(dimension)
             try:
                 s = [int(i) for i in s]
             except:
                 plpy.error(invalid_excluded)
             if any(i < 1 or i > dimension for i in s):
                 plpy.error(invalid_excluded)
         else:
             s = []

         outstr_array = ["%s[%s]" % (col_ind_var, str(i))
                         for i in range(1, dimension + 1) if i not in s]
         if s:
             col_ind_var_new = "ARRAY[" + ",".join(outstr_array) + "]"
         else:
             col_ind_var_new = col_ind_var

         return (col_ind_var_new, outstr_array)
     else:
         plpy.error("Elastic Net error: Single column name included for "
                    "independent variable is not found in source table.")
 # ------------------------------------------------------------------------


 def elastic_net_predict_all(schema_madlib, tbl_model, tbl_new_source,
                             col_id, tbl_predict, **kwargs):
     """
     Predict and put the result in a table. Useful for general CV
     """
     summary_table = add_postfix(tbl_model, "_summary")
     grouping_col = plpy.execute("SELECT grouping_col FROM {summary_table}".
                                 format(summary_table=summary_table))[0]["grouping_col"]
     with MinWarning("error"):
         regress_family = plpy.execute("SELECT family FROM {tbl_model} ".
                                       format(tbl_model=tbl_model))[0]["family"]

         if regress_family.lower() in ("gaussian", "linear"):
             predict_func = "elastic_net_gaussian_predict"
         elif regress_family.lower() in ("binomial", "logistic"):
             predict_func = "elastic_net_binomial_predict"
         else:
             plpy.error("Elastic Net error: Not a supported response family!")

         if col_id is None or col_id == '':
             plpy.error("Elastic Net error: invalid ID column provided!")
         if columns_exist_in_table(tbl_new_source, [col_id], schema_madlib):
             elastic_net_predict_id = col_id
         else:
             elastic_net_predict_id = 'elastic_net_predict_id'

         dense_vars = plpy.execute(""" SELECT features AS fs FROM {tbl_model}
                                   """.format(tbl_model=tbl_model))[0]["fs"]
         dense_vars_str = "ARRAY[" + ", ".join(dense_vars) + "]"
         # Must be careful to avoid possible name conflicts

         if grouping_col and grouping_col != 'NULL':
             qstr = """
                 CREATE TABLE {tbl_predict} AS
                     SELECT
                         {elastic_net_predict_id},
                         {schema_madlib}.{predict_func}(coef_all, intercept, ind_var)
                              AS prediction
                     FROM
                         {tbl_model} as tbl1
                         JOIN
                         (SELECT
                             {grouping_col},
                             {col_id} as {elastic_net_predict_id},
                             {dense_vars_str} as ind_var
                         FROM
                             {tbl_new_source}) tbl2
                         USING ({grouping_col})
                         ORDER BY {grouping_col}, {elastic_net_predict_id}
                 """.format(**locals())
         else:
             qstr = """
             CREATE TABLE {tbl_predict} AS
                 SELECT
                     {elastic_net_predict_id},
                     {schema_madlib}.{predict_func}(coef_all, intercept, ind_var)
                          AS prediction
                 FROM
                     {tbl_model} as tbl1,
                     (
                         SELECT
                             {col_id} as {elastic_net_predict_id},
                             {dense_vars_str} as ind_var
                         FROM
                             {tbl_new_source}
                     ) tbl2
             """.format(**locals())
         plpy.execute(qstr)
     return None
 # ------------------------------------------------------------------------