src/ports/postgres/modules/elastic_net/elastic_net.py_in - madlib - Git at Google


 import plpy
 from elastic_net_models import _elastic_net_gaussian_igd_train
 from elastic_net_models import _elastic_net_gaussian_fista_train
 from elastic_net_models import _elastic_net_binomial_fista_train
 from elastic_net_models import _elastic_net_binomial_igd_train
 from elastic_net_utils import _generate_warmup_lambda_sequence

 from elastic_net_utils import BINOMIAL_FAMILIES, GAUSSIAN_FAMILIES, OPTIMIZERS

 from utilities.validate_args import is_col_array
 from utilities.validate_args import table_exists
 from utilities.validate_args import table_is_empty
 from utilities.validate_args import columns_exist_in_table
 from utilities.validate_args import get_cols_and_types
 from utilities.validate_args import cols_in_tbl_valid
 from utilities.validate_args import explicit_bool_to_text
 from utilities.utilities import extract_keyvalue_params

 from utilities.control import MinWarning

 from utilities.utilities import _string_to_array_with_quotes
 from utilities.utilities import _string_to_array
 from utilities.utilities import is_psql_numeric_type
 from utilities.utilities import _assert
 from utilities.utilities import add_postfix
 import re

 from validation.internal.cross_validation import CrossValidator
 # ------------------------------------------------------------------------


 def elastic_net_help(schema_madlib, family_or_optimizer=None, **kwargs):
     """
     Given a response family name or optimizer name, print out the related
     information.

     If a family name is given, print out the supported optimizer together
     with its default optimizer.

     If an optimizer name is given, print out the necessary parameters.
     """
     if (family_or_optimizer is None or
             family_or_optimizer.lower() in ("help", "?")):
         return """
     ----------------------------------------------------------------
                             Summary
     ----------------------------------------------------------------
     Right now, gaussian (linear) and binomial (logistic) families
     are supported!
     --
     Run:
     SELECT {schema_madlib}.elastic_net_train('gaussian');
     or
     SELECT {schema_madlib}.elastic_net_train('binomial');
     to see more help.
     --
     Run:  SELECT {schema_madlib}.elastic_net_train('usage');
     to see how to use.
     --
     Run:  SELECT {schema_madlib}.elastic_net_train('predict');
     to see how to predict.
     --
     Run:  SELECT {schema_madlib}.elastic_net_train('example');
     to see some examples.
         """.format(schema_madlib=schema_madlib)

     if (family_or_optimizer.lower() in ('example', 'examples')):
         return """
     ----------------------------------------------------------------
                             EXAMPLE
     ----------------------------------------------------------------
     Create an input data set:
     DROP TABLE IF EXISTS houses;
     CREATE TABLE houses ( id INT,
                           tax INT,
                           bedroom INT,
                           bath FLOAT,
                           price INT,
                           size INT,
                           lot INT,
                           zipcode INT);
     INSERT INTO houses VALUES
     (1,  590, 2,   1,  50000,  770, 22100, 94301),
     (2, 1050, 3,   2,  85000, 1410, 12000, 94301),
     (3,   20, 3,   1,  22500, 1060,  3500, 94301),
     (4,  870, 2,   2,  90000, 1300, 17500, 94301),
     (5, 1320, 3,   2, 133000, 1500, 30000, 94301),
     (6, 1350, 2,   1,  90500,  820, 25700, 94301),
     (7, 2790, 3, 2.5, 260000, 2130, 25000, 94301),
     (8,  680, 2,   1, 142500, 1170, 22000, 94301),
     (9, 1840, 3,   2, 160000, 1500, 19000, 94301),
     (10, 3680, 4,   2, 240000, 2790, 20000, 94301),
     (11, 1660, 3,   1,  87000, 1030, 17500, 94301),
     (12, 1620, 3,   2, 118600, 1250, 20000, 94301),
     (13, 3100, 3,   2, 140000, 1760, 38000, 94301),
     (14, 2070, 2,   3, 148000, 1550, 14000, 94301),
     (15,  650, 3, 1.5,  65000, 1450, 12000, 94301),
     (16,  770, 2,   2,  91000, 1300, 17500, 76010),
     (17, 1220, 3,   2, 132300, 1500, 30000, 76010),
     (18, 1150, 2,   1,  91100,  820, 25700, 76010),
     (19, 2690, 3, 2.5, 260011, 2130, 25000, 76010),
     (20,  780, 2,   1, 141800, 1170, 22000, 76010),
     (21, 1910, 3,   2, 160900, 1500, 19000, 76010),
     (22, 3600, 4,   2, 239000, 2790, 20000, 76010),
     (23, 1600, 3,   1,  81010, 1030, 17500, 76010),
     (24, 1590, 3,   2, 117910, 1250, 20000, 76010),
     (25, 3200, 3,   2, 141100, 1760, 38000, 76010),
     (26, 2270, 2,   3, 148011, 1550, 14000, 76010),
     (27,  750, 3, 1.5,  66000, 1450, 12000, 76010);

     Train a model:
     DROP TABLE IF EXISTS houses_en, houses_en_summary;
     SELECT {schema_madlib}.elastic_net_train(
              'houses',                  -- source table
              'houses_en',               -- result table
              'price',                   -- dependent variable
              'array[tax, bath, size]',  -- independent variable
              'gaussian',                -- regression family
              0.5,                       -- alpha value
              0.1,                       -- lambda value
              TRUE,                      -- standardize
              NULL,                      -- grouping column(s)
              'fista',                   -- optimizer
              '',                        -- optimizer parameters
              NULL,                      -- excluded columns
              10000,                     -- maximum iterations
              1e-6                       -- tolerance value
     );

     View the resulting model:
     \\x on
     SELECT * FROM houses_en;
     \\x off

     Use the prediction function to evaluate residuals:
     SELECT id, price, predict, price - predict AS residual
     FROM (
     SELECT
         houses.*,
         {schema_madlib}.elastic_net_gaussian_predict(
             m.coef_all,
             m.intercept,
             ARRAY[tax,bath,size]
             ) AS predict
     FROM houses, houses_en m) s
     ORDER BY id;

     Additional Example (with grouping):
     DROP TABLE IF EXISTS houses_en1, houses_en1_summary;
     SELECT {schema_madlib}.elastic_net_train( 'houses',
              'houses_en1',
              'price',
              'array[tax, bath, size]',
              'gaussian',
              1,
              30000,
              TRUE,
              'zipcode',
              'fista',
              '',
              NULL,
              10000,
              1e-6
            );

     View the resulting model and see a separate model for each group:
     \\x on
     SELECT * FROM houses_en1;
     \\x off

     Use the prediction function to evaluate residuals:
     SELECT {schema_madlib}.elastic_net_predict(
             'houses_en1',            -- model table
             'houses',                -- new source data table
             'id',                    -- unique ID associated with each row
             'houses_en1_prediction'  -- table to store prediction result
           );

     View the results:
     SELECT  houses.id,
             houses.price,
             houses_en1_prediction.prediction,
             houses.price - houses_en1_prediction.prediction AS residual
     FROM houses_en1_prediction, houses
     WHERE houses.id=houses_en1_prediction.id;

         """

     if (family_or_optimizer.lower() in ('usage', 'help', '?')):
         return """
     ----------------------------------------------------------------
                             USAGE
     ----------------------------------------------------------------
     SELECT {schema_madlib}.elastic_net_train (
         'tbl_source',      -- Data table
         'tbl_result',      -- Result table
         'col_dep_var',     -- Dependent variable, can be an expression or
                                 '*'
         'col_ind_var',     -- Independent variable, can be an expression
         'regress_family',  -- 'gaussian' (or 'linear'). 'binomial'
                                 (or 'logistic')
         alpha,             -- Elastic net controlparameter, value in [0, 1]
         lambda_value,      -- Regularization parameter, positive
         standardize,       --Whether to normalize the ata
         'grouping_col',    -- Group by which columns. (DEFAULT: NULL)
         'optimizer',       --Name of optimizer. (DEFAUT: 'fista')
         'optimizer_params',-- Comma-separated string of optimizer parameters
         'excluded',        -- Column names excluded frm '*' (DEFAULT = NULL)
         max_iter,          -- Maximum iteration numbr (DEFAULT = 1000)
         tolerance          -- Stopping criteria (DEFAULT = 1e-4)
     );
     ----------------------------------------------------------------
                             OUTPUT
     ----------------------------------------------------------------
     The output table ('tbl_result' above) has the following columns:
     grouping_col      TEXT,               --'Distinct values of groupng_col'
     family            TEXT,               --'gaussian' or 'binomial'
     features          TEXT[],             -- All feature column names
     features_selected TEXT[],             -- Features with non-zero coefficients
     coef_nonzero      DOUBLE PRECISION[], -- Non-zero coefficients
     coef_all          DOUBLE PRECISION[], -- All coefficients
     intercept         DOUBLE PRECISION,   -- Intercept of the linear fit
     log_likelihood    DOUBLE PRECISION,   -- log-likelihood of the fit
     standardize       BOOLEAN,            -- Whether the data was standardized
                                           -- before fitting
     iteration_run     INTEGER             -- How many iteration was actually run

     If the independent variable is a column with type of array, features
     and features_selected will output indices of the array.
         """.format(schema_madlib=schema_madlib)

     if family_or_optimizer.lower() == "predict":
         return """
     ----------------------------------------------------------------
                             Prediction
     ----------------------------------------------------------------
     SELECT {schema_madlib}.elastic_net_predict(
         'regress_family', -- 'gaussian' (or 'linear'). 'binomial'
                           --  (or 'logistic') will be supported
         coefficients,     -- Fitting coefficients as a double array
         intercept,
         ind_var           -- independent variables
     ) FROM tbl_result, tbl_new_source;

     When predicting with binomial models, the return value is 1
     if the predicted result is True, and 0 if the prediction is
     False.

     OR -------------------------------------------------------------

     (1) SELECT {schema_madlib}.elastic_net_gaussian_predict (
             coefficients, intercept, ind_var
         ) FROM tbl_result, tbl_new_source;

     (2) SELECT {schema_madlib}.elastic_net_binomial_predict (
             coefficients, intercept, ind_var
         ) FROM tbl_result, tbl_new_source;


     (3) SELECT {schema_madlib}.elastic_net_binomial_prob (
             coefficients, intercept, ind_var
         ) FROM tbl_result, tbl_new_source;

         This returns probability values for the class being 'True'.

     OR -------------------------------------------------------------

     SELECT {schema_madlib}.elastic_net_predict(
         'tbl_model',      -- Result table of elastic_net_train
         'tbl_new_source', -- New data source
         'col_id',         -- Unique ID column
        'tbl_predict'      -- Prediction result
     );
     will put all prediction results into a table. This can be
     used together with cross_validation_general() function.

     When predicting with binomial models, the predicted values
     are BOOLEAN.
         """.format(schema_madlib=schema_madlib)

     if (family_or_optimizer.lower() in ("gaussian", "linear")):
         return """
     ----------------------------------------------------------------
     Fitting linear models
     ----------------------------------------------------------------
     Supported optimizer:
     (1) Incremental gradient descent method ('igd')
     (2) Fast iterative shrinkage thesholding algorithm ('fista')

     Default is 'fista'
     --
     Run:
     SELECT {schema_madlib}.elastic_net_train('optimizer');
     to see more help on each optimizer.
         """.format(schema_madlib=schema_madlib)

     if (family_or_optimizer.lower() in ("binomial", "logistic")):
         return """
     ----------------------------------------------------------------
     Fitting logistic models
     ----------------------------------------------------------------
     The dependent variable must be a BOOLEAN.

     Supported optimizer:
     (1) Incremental gradient descent method ('igd')
     (2) Fast iterative shrinkage thesholding algorithm ('fista')

     Default is 'fista'
     --
     Run:
     SELECT {schema_madlib}.elastic_net_train('optimizer');
     to see more help on each optimizer.
         """.format(schema_madlib=schema_madlib)

     if family_or_optimizer.lower() == "igd":
         return """
     ----------------------------------------------------------------
     Incremental gradient descent (IGD) method
     ----------------------------------------------------------------
     Right now, it supports fitting both linear and logistic models.

     In order to obtain sparse coefficients, a
     modified version of IGD is actually used.

     Parameters --------------------------------
     stepsize          - default is 0.01
     threshold         - default is 1e-10. When a coefficient is really
                         small, set it to be 0
     warmup            - default is False
     warmup_lambdas    - default is Null
     warmup_lambda_no  - default is 15. How many lambda's are used in
                         warm-up, will be overridden if warmup_lambdas
                         is not NULL
     warmup_tolerance  - default is the same as tolerance. The value
                         of tolerance used during warmup.
     n_folds           - default is 1, Number of cross validation folds.
                         Set this to greater than 1 if CV over lambda is required.
     validation_result - Name of the table to store the cross validation results.
     parallel          - default is True. Run the computation on
                         multiple segments or not.

     When warmup is True or if n_folds > 1, and warmup_lambdas is NULL, a series
     of lambda values will be automatically generated and used.

     Reference --------------------------------
     [1] Shai Shalev-Shwartz and Ambuj Tewari, Stochastic Methods for l1
         Regularized Loss Minimization. Proceedings of the 26th Interna-
         tional Conference on Machine Learning, Montreal, Canada, 2009.
         """

     if family_or_optimizer.lower() == "fista":
         return """
     ----------------------------------------------------------------
     Fast iterative shrinkage thesholding algorithm
     with backtracking for stepsizes
     ----------------------------------------------------------------
     Right now, it supports fitting both linear and logistic models.

     Parameters --------------------------------
     max_stepsize        - default is 4.0
     eta                 - default is 1.2, if stepsize does not work
                           stepsize/eta will be tried
     warmup              - default is False
     warmup_lambdas      - default is NULL, which means that lambda
                           values will be automatically generated
     warmup_lambda_no    - default is 15. How many lambda's are used in
                           warm-up, will be overridden if warmup_lambdas
                           is not NULL
     warmup_tolerance    - default is the same as tolerance. The value
                           of tolerance used during warmup.
     use_active_set      - default is False. Sometimes active-set method
                           can speed up the calculation.
     activeset_tolerance - default is the same as tolerance. The
                           value of tolerance used during active set
                           calculation
     random_stepsize     - default is False. Whether add some randomness
                          to the step size. Sometimes, this can speed
                          up the calculation.
     n_folds             - default is 1. Number of cross validation folds.
                           Set this to greater than 1 if CV over lambda is required.
     validation_result   - Name of the table to store the cross validation results.

     When warmup is True and warmup_lambdas is NULL, warmup_lambda_no
     of lambda values will be automatically generated and used.

     Reference --------------------------------
     [1] Beck, A. and M. Teboulle (2009), A fast iterative
         shrinkage-thresholding algorothm for linear inverse
         problems. SIAM J. on Imaging Sciences 2(1), 183-202.
         """

     # if family_or_optimizer.lower() == "newton":
     #     return "Newton method  "

     return """
     Elastic Net error: Not a supported response family or optimizer
     Run:
     SELECT {schema_madlib}.elastic_net_train();
     for help
     """.format(schema_madlib=schema_madlib)
 # ------------------------------------------------------------------------


 def elastic_net_train(schema_madlib, source_table, model_table, dependent_varname,
                       independent_varname, regress_family, alpha, lambda_value,
                       standardize, grouping_col, optimizer,
                       optimizer_params, excluded, max_iter, tolerance,
                       **kwargs):
     """
     A wrapper for all variants of elastic net regularization.

     @param source_table        Name of data source table
     @param independent_varname       Name of independent variable column,
                              independent variable is an array
     @param dependent_varname       Name of dependent variable column
     @param model_table        Name of the table to store the results,
                              will return fitting coefficients and
                              likelihood
     @param lambda_value      The regularization parameter
     @param alpha             The elastic net parameter, [0, 1]
     @param standardize       Whether to normalize the variables
     @param regress_family    Response type, 'gaussian' or 'binomial'
     @param optimizer         The optimization algorithm, for example 'igd'
     @param optimizer_params  Parameters of the above optimizer, the format
                              is '{arg = value, ...}'::varchar[]
     @param excluded          Which variables are excluded when
                              independent_varname == "*"
     """
     with MinWarning("warning"):
         if regress_family is None:
             plpy.error("""
                        Elastic Net error: Please enter a valid response family name!
                        Run:
                        SELECT {schema_madlib}.elastic_net_train();
                        for supported response family.
                        """.format(schema_madlib=schema_madlib))

         if optimizer is None:
             plpy.error("""
                        Elastic Net error: Please enter a valid optimizer name!
                        Run:
                        SELECT {schema_madlib}.elastic_net_train('gaussian');
                        for supported optimizers.
                        """.format(schema_madlib=schema_madlib))

         regress_family = regress_family.lower()
         optimizer = optimizer.lower()

         if (regress_family not in (BINOMIAL_FAMILIES + GAUSSIAN_FAMILIES) or
                 optimizer not in OPTIMIZERS):
             plpy.error("""
                 Elastic Net error: Not a supported response family or supported
                 optimizer of the given response family!

                 Run:
                     SELECT {schema_madlib}.elastic_net_train();
                 for help.
                 """.format(schema_madlib=schema_madlib))

         cv_param, optimizer_params = _get_cv_optimizer_params(
             optimizer_params, alpha, lambda_value)

         args = locals()
         if cv_param['n_folds'] > 1:
             args.update(cv_param)
             _cross_validate_en(args)
         _internal_elastic_net_train(**args)
 # ------------------------------------------------------------------------


 def _internal_elastic_net_train(
         schema_madlib, source_table, model_table, dependent_varname,
         independent_varname, grouping_col,
         regress_family, alpha, lambda_value,
         standardize, optimizer, optimizer_params, excluded,
         max_iter, tolerance, **kwargs):

     tbl_summary = add_postfix(model_table, "_summary")

     # handle all special cases of independent_varname
     independent_varname, outstr_array = analyze_input_str(
         schema_madlib, source_table,
         independent_varname, dependent_varname, excluded)

     # get the grouping info
     grouping_str, grouping_col = _get_grouping_col_str(schema_madlib,
                                                        source_table, grouping_col)

     if regress_family in GAUSSIAN_FAMILIES:
         if optimizer == OPTIMIZERS.igd:
             _elastic_net_gaussian_igd_train(
                 schema_madlib, source_table, independent_varname, dependent_varname,
                 model_table, tbl_summary, lambda_value, alpha, standardize,
                 optimizer_params, max_iter, tolerance, outstr_array,
                 grouping_str, grouping_col, **kwargs)
             return None
         if optimizer == OPTIMIZERS.fista:
             _elastic_net_gaussian_fista_train(
                 schema_madlib, source_table, independent_varname, dependent_varname,
                 model_table, tbl_summary, lambda_value, alpha, standardize,
                 optimizer_params, max_iter, tolerance, outstr_array,
                 grouping_str, grouping_col, **kwargs)
             return None
     elif regress_family in BINOMIAL_FAMILIES:
         if optimizer == OPTIMIZERS.igd:
             dependent_varname = "(" + dependent_varname + ")::boolean"
             _elastic_net_binomial_igd_train(
                 schema_madlib, source_table, independent_varname, dependent_varname,
                 model_table, tbl_summary, lambda_value, alpha, standardize,
                 optimizer_params, max_iter, tolerance, outstr_array,
                 grouping_str, grouping_col, **kwargs)
             return None
         if optimizer == OPTIMIZERS.fista:
             dependent_varname = "(" + dependent_varname + ")::boolean"
             _elastic_net_binomial_fista_train(
                 schema_madlib, source_table, independent_varname, dependent_varname,
                 model_table, tbl_summary, lambda_value, alpha, standardize,
                 optimizer_params, max_iter, tolerance, outstr_array,
                 grouping_str, grouping_col, **kwargs)
             return None
     return None
 # ----------------------------------------------------------------------


 def _get_cv_optimizer_params(param_str, alpha, smallest_lambda):
     cv_params_defaults = {
         "n_folds": (1, int),
         "lambda_value": (None, list),
         "alpha": ([alpha], list),
         "n_lambdas": (15, int),
         "validation_result": (None, str)
     }
     param_defaults = dict([(k, v[0]) for k, v in cv_params_defaults.items()])
     param_types = dict([(k, v[1]) for k, v in cv_params_defaults.items()])

     if not param_str:
         return param_defaults, param_str

     name_value = extract_keyvalue_params(param_str, param_types, param_defaults,
                                          ignore_invalid=True)
     if name_value['n_folds'] > 1:
         if not name_value['lambda_value']:
             if name_value['n_lambdas']:
                 name_value['lambda_value'] = _generate_warmup_lambda_sequence(
                     smallest_lambda, name_value['n_lambdas'])
                 # no warmup when cross validating on lambda
                 param_str += ', warmup=False'
             else:
                 name_value['lambda_value'] = [float(smallest_lambda)]
         else:
             name_value['lambda_value'] = map(float, name_value['lambda_value'])
             # no warmup when cross validating on lambda
             param_str += ', warmup=False'
         name_value['alpha'] = map(float, name_value['alpha'])
     return name_value, param_str
 # ------------------------------------------------------------------------


 def _cross_validate_en(args):
     # updating params_dict will also update args['params_dict']
     if args['n_folds'] > 1 and args['grouping_col']:
         plpy.error('Elastic Net Error: cross validation with grouping is not supported!')

     allowed_cv_params = ('lambda_value', 'alpha')  # keep trailing comma for single element
     cv_params_values = {}

     for param in allowed_cv_params:
         if isinstance(args[param], list):
             if len(args[param]) > 1:
                 cv_params_values[param] = args[param]
             else:
                 args[param] = args[param][0]

     if not cv_params_values and args['n_folds'] <= 1:
         # no cross validation
         return

     if not cv_params_values and args['n_folds'] > 1:
         plpy.warning('Elastic Net Warning: n_folds > 1 but no '
                      'cross validation parameter provided')
         return

     if cv_params_values and args['n_folds'] <= 1:
         plpy.error('Elastic Net Error: All parameters must be scalar '
                    'when n_folds is 0 or 1')

     scorer = 'classification' if args['regress_family'] in BINOMIAL_FAMILIES else 'regression'
     cv = CrossValidator(_internal_elastic_net_train, elastic_net_predict_all, scorer, args)
     val_res = cv.validate(cv_params_values, args['n_folds'])
     if 'validation_result' in args:
         val_res.output_tbl(args['validation_result'])
     args.update(val_res.top('sub_args'))
 # ------------------------------------------------------------------------------


 def _get_grouping_col_str(schema_madlib, source_table, grouping_col):
     if grouping_col and grouping_col.lower() != 'null':
         cols_in_tbl_valid(source_table,
                           _string_to_array_with_quotes(grouping_col),
                           'elastic_net')
         intersect = frozenset(
             _string_to_array(grouping_col)).intersection(
                 frozenset(
                     ('regress_family', 'coef_all',
                      'features_selected',
                      'coef_nonzero', 'intercept',
                      'log_likelihood', 'standardize',
                      'iteration_run')))
         _assert(len(intersect) == 0,
                 "Elastic Net error: Conflicting grouping column name.\n"
                 "Some predefined keyword(s) ({0}) are not allowed "
                 "for grouping column names!".format(', '.join(intersect)))

         grouping_list = [i + "::text"
                          for i in explicit_bool_to_text(
                              source_table,
                              _string_to_array_with_quotes(grouping_col),
                              schema_madlib)]
         grouping_str = ','.join(grouping_list)
     else:
         grouping_str = "Null"
         grouping_col = None

     return grouping_str, grouping_col
 # ------------------------------------------------------------------------------


 def _check_args(tbl_source, col_ind_var, col_dep_var):
     """
     Check arguments before analyze_input_str
     """
     if any(each_arg is None for each_arg in (tbl_source, col_ind_var, col_dep_var)):
         plpy.error("Elastic Net error: You have unsupported NULL value(s) in the arguments!")
     if any(each_arg.strip() == '' for each_arg in (tbl_source, col_ind_var, col_dep_var)):
         plpy.error("Elastic Net error: You have unsupported EMPTY value(s) in the arguments!")

     if not table_exists(tbl_source):
         plpy.error("Elastic Net error: Data table " + tbl_source + " does not exist!")

     if table_is_empty(tbl_source):
         plpy.error("Elastic Net error: Data table " + tbl_source + " is empty!")
 # ------------------------------------------------------------------------


 def analyze_input_str(schema_madlib, tbl_source,
                       col_ind_var, col_dep_var, excluded):
     """
     Make input strings and output strings compatible with functions

     @param tbl_source Data table
     @param col_ind_var Independent variables
     @param col_dep_var Dependent variables
     @param excluded Which variables are excluded when col_ind_var == "*"
     """
     _check_args(tbl_source, col_ind_var, col_dep_var)

     outstr_array = []
     if col_ind_var == "*":
         col_types_dict = dict(get_cols_and_types(tbl_source))
         cols = col_types_dict.keys()

         s = _string_to_array(excluded) if excluded is not None else []

         for each_col in cols:
             if each_col not in s and each_col != col_dep_var:
                 outstr_array.append(each_col)

         if not outstr_array:
             plpy.error("Elastic Net error: All columns from independent variables "
                        "have been excluded")
         elif (len(outstr_array) == 1 and
                 col_types_dict[outstr_array[0]].lower() == 'array'):
             col_ind_var = outstr_array[0]
             return analyze_single_input_str(schema_madlib, tbl_source,
                                             col_ind_var)
         else:
             included_col_types = [col_types_dict[i] for i in outstr_array]
             if not all(is_psql_numeric_type(i)
                        for i in included_col_types):
                 plpy.error("Elastic Net error: All columns to be included in the "
                            "independent variables should be of the numeric type.")
         col_ind_var_new = "ARRAY[" + ','.join(outstr_array) + "]"
         return (col_ind_var_new, outstr_array)

     if columns_exist_in_table(tbl_source, [col_ind_var], schema_madlib):
         # if the input is a column name and not an expression
         return analyze_single_input_str(schema_madlib, tbl_source,
                                         col_ind_var, excluded)
     else:
         # if input is an expression resulting in an array output
         matched = re.match(r"(?i)^array\[(.*)\]", col_ind_var)
         if matched:
             # array expression starts with the word "ARRAY"
             outstr_array = _string_to_array(matched.group(1))
         else:
             # any other form of array expression
             n_feat = plpy.execute(""" SELECT array_upper({indep_var}, 1) as num_feat
                                       FROM {source} LIMIT 1
                                   """.format(indep_var=col_ind_var,
                                              source=tbl_source))[0]["num_feat"]
             outstr_array = ["[" + str(i) + "]" for i in range(1, n_feat + 1)]
         # We allow expressions for independent variables that could start with
         # something other than 'array'
         # Example use case: input independent variable of array column
         # adding an intercept could be done as '1 || x' where 'x' is array
         # of independent variables.
         # plpy.error("Elastic Net error: Independent variable format is not quite right!")
         return (col_ind_var, outstr_array)
 # ------------------------------------------------------------------------


 def analyze_single_input_str(schema_madlib, tbl_source, col_ind_var,
                              excluded=None):
     """
     Args:
         @param schema_madlib: string, Name of schema where MADlib is installed
         @param tbl_source: string, Name of input table
         @param col_ind_var: string, Name of independent variable
                             (must be single column name and of ARRAY type )
         @param excluded: list, Indices of elements to exclude.

     Returns:

     """
     if columns_exist_in_table(tbl_source, [col_ind_var], schema_madlib):
         # a single column is independent variable
         # which means that it is an array
         # excluded must be a string containing integers
         if not is_col_array(tbl_source, col_ind_var):
             plpy.error("Elastic Net error: The independent variable must be an array!")

         dimension = plpy.execute(
             """
             SELECT array_upper({col_ind_var}, 1) AS dimension
             FROM {tbl_source} limit 1
             """.format(tbl_source=tbl_source,
                        col_ind_var=col_ind_var))[0]["dimension"]

         if excluded is not None:
             s = _string_to_array(excluded)
             invalid_excluded = """
                                Elastic Net error: When the independent variable is
                                an array column, excluded values can only be indices
                                (i.e. between 1 and {0})""".format(dimension)
             try:
                 s = [int(i) for i in s]
             except:
                 plpy.error(invalid_excluded)
             if any(i < 1 or i > dimension for i in s):
                 plpy.error(invalid_excluded)
         else:
             s = []

         outstr_array = ["%s[%s]" % (col_ind_var, str(i))
                         for i in range(1, dimension + 1) if i not in s]
         if s:
             col_ind_var_new = "ARRAY[" + ",".join(outstr_array) + "]"
         else:
             col_ind_var_new = col_ind_var

         return (col_ind_var_new, outstr_array)
     else:
         plpy.error("Elastic Net error: Single column name included for "
                    "independent variable is not found in source table.")
 # ------------------------------------------------------------------------


 def elastic_net_predict_all(schema_madlib, tbl_model, tbl_new_source,
                             col_id, tbl_predict, **kwargs):
     """
     Predict and put the result in a table. Useful for general CV
     """
     summary_table = add_postfix(tbl_model, "_summary")
     grouping_col = plpy.execute("SELECT grouping_col FROM {summary_table}".
                                 format(summary_table=summary_table))[0]["grouping_col"]
     with MinWarning("error"):
         regress_family = plpy.execute("SELECT family FROM {tbl_model} ".
                                       format(tbl_model=tbl_model))[0]["family"]

         if regress_family.lower() in ("gaussian", "linear"):
             predict_func = "elastic_net_gaussian_predict"
         elif regress_family.lower() in ("binomial", "logistic"):
             predict_func = "elastic_net_binomial_predict"
         else:
             plpy.error("Elastic Net error: Not a supported response family!")

         if col_id is None or col_id == '':
             plpy.error("Elastic Net error: invalid ID column provided!")
         if columns_exist_in_table(tbl_new_source, [col_id], schema_madlib):
             elastic_net_predict_id = col_id
         else:
             elastic_net_predict_id = 'elastic_net_predict_id'

         dense_vars = plpy.execute(""" SELECT features AS fs FROM {tbl_model}
                                   """.format(tbl_model=tbl_model))[0]["fs"]
         dense_vars_str = "ARRAY[" + ", ".join(dense_vars) + "]"
         # Must be careful to avoid possible name conflicts

         if grouping_col and grouping_col != 'NULL':
             qstr = """
                 CREATE TABLE {tbl_predict} AS
                     SELECT
                         {elastic_net_predict_id},
                         {schema_madlib}.{predict_func}(coef_all, intercept, ind_var)
                              AS prediction
                     FROM
                         {tbl_model} as tbl1
                         JOIN
                         (SELECT
                             {grouping_col},
                             {col_id} as {elastic_net_predict_id},
                             {dense_vars_str} as ind_var
                         FROM
                             {tbl_new_source}) tbl2
                         USING ({grouping_col})
                         ORDER BY {grouping_col}, {elastic_net_predict_id}
                 """.format(**locals())
         else:
             qstr = """
             CREATE TABLE {tbl_predict} AS
                 SELECT
                     {elastic_net_predict_id},
                     {schema_madlib}.{predict_func}(coef_all, intercept, ind_var)
                          AS prediction
                 FROM
                     {tbl_model} as tbl1,
                     (
                         SELECT
                             {col_id} as {elastic_net_predict_id},
                             {dense_vars_str} as ind_var
                         FROM
                             {tbl_new_source}
                     ) tbl2
             """.format(**locals())
         plpy.execute(qstr)
     return None
 # ------------------------------------------------------------------------