| |
| import plpy |
| from elastic_net_models import _elastic_net_gaussian_igd_train |
| from elastic_net_models import _elastic_net_gaussian_fista_train |
| from elastic_net_models import _elastic_net_binomial_fista_train |
| from elastic_net_models import _elastic_net_binomial_igd_train |
| from elastic_net_utils import _generate_warmup_lambda_sequence |
| |
| from elastic_net_utils import BINOMIAL_FAMILIES, GAUSSIAN_FAMILIES, OPTIMIZERS |
| |
| from utilities.validate_args import is_col_array |
| from utilities.validate_args import table_exists |
| from utilities.validate_args import table_is_empty |
| from utilities.validate_args import columns_exist_in_table |
| from utilities.validate_args import get_cols_and_types |
| from utilities.validate_args import cols_in_tbl_valid |
| from utilities.validate_args import explicit_bool_to_text |
| from utilities.utilities import extract_keyvalue_params |
| |
| from utilities.control import MinWarning |
| |
| from utilities.utilities import _string_to_array_with_quotes |
| from utilities.utilities import _string_to_array |
| from utilities.utilities import is_psql_numeric_type |
| from utilities.utilities import _assert |
| from utilities.utilities import add_postfix |
| import re |
| |
| from validation.internal.cross_validation import CrossValidator |
| # ------------------------------------------------------------------------ |
| |
| |
| def elastic_net_help(schema_madlib, family_or_optimizer=None, **kwargs): |
| """ |
| Given a response family name or optimizer name, print out the related |
| information. |
| |
| If a family name is given, print out the supported optimizer together |
| with its default optimizer. |
| |
| If an optimizer name is given, print out the necessary parameters. |
| """ |
| if (family_or_optimizer is None or |
| family_or_optimizer.lower() in ("help", "?")): |
| return """ |
| ---------------------------------------------------------------- |
| Summary |
| ---------------------------------------------------------------- |
| Right now, gaussian (linear) and binomial (logistic) families |
| are supported! |
| -- |
| Run: |
| SELECT {schema_madlib}.elastic_net_train('gaussian'); |
| or |
| SELECT {schema_madlib}.elastic_net_train('binomial'); |
| to see more help. |
| -- |
| Run: SELECT {schema_madlib}.elastic_net_train('usage'); |
| to see how to use. |
| -- |
| Run: SELECT {schema_madlib}.elastic_net_train('predict'); |
| to see how to predict. |
| -- |
| Run: SELECT {schema_madlib}.elastic_net_train('example'); |
| to see some examples. |
| """.format(schema_madlib=schema_madlib) |
| |
| if (family_or_optimizer.lower() in ('example', 'examples')): |
| return """ |
| ---------------------------------------------------------------- |
| EXAMPLE |
| ---------------------------------------------------------------- |
| Create an input data set: |
| DROP TABLE IF EXISTS houses; |
| CREATE TABLE houses ( id INT, |
| tax INT, |
| bedroom INT, |
| bath FLOAT, |
| price INT, |
| size INT, |
| lot INT, |
| zipcode INT); |
| INSERT INTO houses VALUES |
| (1, 590, 2, 1, 50000, 770, 22100, 94301), |
| (2, 1050, 3, 2, 85000, 1410, 12000, 94301), |
| (3, 20, 3, 1, 22500, 1060, 3500, 94301), |
| (4, 870, 2, 2, 90000, 1300, 17500, 94301), |
| (5, 1320, 3, 2, 133000, 1500, 30000, 94301), |
| (6, 1350, 2, 1, 90500, 820, 25700, 94301), |
| (7, 2790, 3, 2.5, 260000, 2130, 25000, 94301), |
| (8, 680, 2, 1, 142500, 1170, 22000, 94301), |
| (9, 1840, 3, 2, 160000, 1500, 19000, 94301), |
| (10, 3680, 4, 2, 240000, 2790, 20000, 94301), |
| (11, 1660, 3, 1, 87000, 1030, 17500, 94301), |
| (12, 1620, 3, 2, 118600, 1250, 20000, 94301), |
| (13, 3100, 3, 2, 140000, 1760, 38000, 94301), |
| (14, 2070, 2, 3, 148000, 1550, 14000, 94301), |
| (15, 650, 3, 1.5, 65000, 1450, 12000, 94301), |
| (16, 770, 2, 2, 91000, 1300, 17500, 76010), |
| (17, 1220, 3, 2, 132300, 1500, 30000, 76010), |
| (18, 1150, 2, 1, 91100, 820, 25700, 76010), |
| (19, 2690, 3, 2.5, 260011, 2130, 25000, 76010), |
| (20, 780, 2, 1, 141800, 1170, 22000, 76010), |
| (21, 1910, 3, 2, 160900, 1500, 19000, 76010), |
| (22, 3600, 4, 2, 239000, 2790, 20000, 76010), |
| (23, 1600, 3, 1, 81010, 1030, 17500, 76010), |
| (24, 1590, 3, 2, 117910, 1250, 20000, 76010), |
| (25, 3200, 3, 2, 141100, 1760, 38000, 76010), |
| (26, 2270, 2, 3, 148011, 1550, 14000, 76010), |
| (27, 750, 3, 1.5, 66000, 1450, 12000, 76010); |
| |
| Train a model: |
| DROP TABLE IF EXISTS houses_en, houses_en_summary; |
| SELECT {schema_madlib}.elastic_net_train( |
| 'houses', -- source table |
| 'houses_en', -- result table |
| 'price', -- dependent variable |
| 'array[tax, bath, size]', -- independent variable |
| 'gaussian', -- regression family |
| 0.5, -- alpha value |
| 0.1, -- lambda value |
| TRUE, -- standardize |
| NULL, -- grouping column(s) |
| 'fista', -- optimizer |
| '', -- optimizer parameters |
| NULL, -- excluded columns |
| 10000, -- maximum iterations |
| 1e-6 -- tolerance value |
| ); |
| |
| View the resulting model: |
| \\x on |
| SELECT * FROM houses_en; |
| \\x off |
| |
| Use the prediction function to evaluate residuals: |
| SELECT id, price, predict, price - predict AS residual |
| FROM ( |
| SELECT |
| houses.*, |
| {schema_madlib}.elastic_net_gaussian_predict( |
| m.coef_all, |
| m.intercept, |
| ARRAY[tax,bath,size] |
| ) AS predict |
| FROM houses, houses_en m) s |
| ORDER BY id; |
| |
| Additional Example (with grouping): |
| DROP TABLE IF EXISTS houses_en1, houses_en1_summary; |
| SELECT {schema_madlib}.elastic_net_train( 'houses', |
| 'houses_en1', |
| 'price', |
| 'array[tax, bath, size]', |
| 'gaussian', |
| 1, |
| 30000, |
| TRUE, |
| 'zipcode', |
| 'fista', |
| '', |
| NULL, |
| 10000, |
| 1e-6 |
| ); |
| |
| View the resulting model and see a separate model for each group: |
| \\x on |
| SELECT * FROM houses_en1; |
| \\x off |
| |
| Use the prediction function to evaluate residuals: |
| SELECT {schema_madlib}.elastic_net_predict( |
| 'houses_en1', -- model table |
| 'houses', -- new source data table |
| 'id', -- unique ID associated with each row |
| 'houses_en1_prediction' -- table to store prediction result |
| ); |
| |
| View the results: |
| SELECT houses.id, |
| houses.price, |
| houses_en1_prediction.prediction, |
| houses.price - houses_en1_prediction.prediction AS residual |
| FROM houses_en1_prediction, houses |
| WHERE houses.id=houses_en1_prediction.id; |
| |
| """ |
| |
| if (family_or_optimizer.lower() in ('usage', 'help', '?')): |
| return """ |
| ---------------------------------------------------------------- |
| USAGE |
| ---------------------------------------------------------------- |
| SELECT {schema_madlib}.elastic_net_train ( |
| 'tbl_source', -- Data table |
| 'tbl_result', -- Result table |
| 'col_dep_var', -- Dependent variable, can be an expression or |
| '*' |
| 'col_ind_var', -- Independent variable, can be an expression |
| 'regress_family', -- 'gaussian' (or 'linear'). 'binomial' |
| (or 'logistic') |
| alpha, -- Elastic net controlparameter, value in [0, 1] |
| lambda_value, -- Regularization parameter, positive |
| standardize, --Whether to normalize the ata |
| 'grouping_col', -- Group by which columns. (DEFAULT: NULL) |
| 'optimizer', --Name of optimizer. (DEFAUT: 'fista') |
| 'optimizer_params',-- Comma-separated string of optimizer parameters |
| 'excluded', -- Column names excluded frm '*' (DEFAULT = NULL) |
| max_iter, -- Maximum iteration numbr (DEFAULT = 1000) |
| tolerance -- Stopping criteria (DEFAULT = 1e-4) |
| ); |
| ---------------------------------------------------------------- |
| OUTPUT |
| ---------------------------------------------------------------- |
| The output table ('tbl_result' above) has the following columns: |
| grouping_col TEXT, --'Distinct values of groupng_col' |
| family TEXT, --'gaussian' or 'binomial' |
| features TEXT[], -- All feature column names |
| features_selected TEXT[], -- Features with non-zero coefficients |
| coef_nonzero DOUBLE PRECISION[], -- Non-zero coefficients |
| coef_all DOUBLE PRECISION[], -- All coefficients |
| intercept DOUBLE PRECISION, -- Intercept of the linear fit |
| log_likelihood DOUBLE PRECISION, -- log-likelihood of the fit |
| standardize BOOLEAN, -- Whether the data was standardized |
| -- before fitting |
| iteration_run INTEGER -- How many iteration was actually run |
| |
| If the independent variable is a column with type of array, features |
| and features_selected will output indices of the array. |
| """.format(schema_madlib=schema_madlib) |
| |
| if family_or_optimizer.lower() == "predict": |
| return """ |
| ---------------------------------------------------------------- |
| Prediction |
| ---------------------------------------------------------------- |
| SELECT {schema_madlib}.elastic_net_predict( |
| 'regress_family', -- 'gaussian' (or 'linear'). 'binomial' |
| -- (or 'logistic') will be supported |
| coefficients, -- Fitting coefficients as a double array |
| intercept, |
| ind_var -- independent variables |
| ) FROM tbl_result, tbl_new_source; |
| |
| When predicting with binomial models, the return value is 1 |
| if the predicted result is True, and 0 if the prediction is |
| False. |
| |
| OR ------------------------------------------------------------- |
| |
| (1) SELECT {schema_madlib}.elastic_net_gaussian_predict ( |
| coefficients, intercept, ind_var |
| ) FROM tbl_result, tbl_new_source; |
| |
| (2) SELECT {schema_madlib}.elastic_net_binomial_predict ( |
| coefficients, intercept, ind_var |
| ) FROM tbl_result, tbl_new_source; |
| |
| |
| (3) SELECT {schema_madlib}.elastic_net_binomial_prob ( |
| coefficients, intercept, ind_var |
| ) FROM tbl_result, tbl_new_source; |
| |
| This returns probability values for the class being 'True'. |
| |
| OR ------------------------------------------------------------- |
| |
| SELECT {schema_madlib}.elastic_net_predict( |
| 'tbl_model', -- Result table of elastic_net_train |
| 'tbl_new_source', -- New data source |
| 'col_id', -- Unique ID column |
| 'tbl_predict' -- Prediction result |
| ); |
| will put all prediction results into a table. This can be |
| used together with cross_validation_general() function. |
| |
| When predicting with binomial models, the predicted values |
| are BOOLEAN. |
| """.format(schema_madlib=schema_madlib) |
| |
| if (family_or_optimizer.lower() in ("gaussian", "linear")): |
| return """ |
| ---------------------------------------------------------------- |
| Fitting linear models |
| ---------------------------------------------------------------- |
| Supported optimizer: |
| (1) Incremental gradient descent method ('igd') |
| (2) Fast iterative shrinkage thesholding algorithm ('fista') |
| |
| Default is 'fista' |
| -- |
| Run: |
| SELECT {schema_madlib}.elastic_net_train('optimizer'); |
| to see more help on each optimizer. |
| """.format(schema_madlib=schema_madlib) |
| |
| if (family_or_optimizer.lower() in ("binomial", "logistic")): |
| return """ |
| ---------------------------------------------------------------- |
| Fitting logistic models |
| ---------------------------------------------------------------- |
| The dependent variable must be a BOOLEAN. |
| |
| Supported optimizer: |
| (1) Incremental gradient descent method ('igd') |
| (2) Fast iterative shrinkage thesholding algorithm ('fista') |
| |
| Default is 'fista' |
| -- |
| Run: |
| SELECT {schema_madlib}.elastic_net_train('optimizer'); |
| to see more help on each optimizer. |
| """.format(schema_madlib=schema_madlib) |
| |
| if family_or_optimizer.lower() == "igd": |
| return """ |
| ---------------------------------------------------------------- |
| Incremental gradient descent (IGD) method |
| ---------------------------------------------------------------- |
| Right now, it supports fitting both linear and logistic models. |
| |
| In order to obtain sparse coefficients, a |
| modified version of IGD is actually used. |
| |
| Parameters -------------------------------- |
| stepsize - default is 0.01 |
| threshold - default is 1e-10. When a coefficient is really |
| small, set it to be 0 |
| warmup - default is False |
| warmup_lambdas - default is Null |
| warmup_lambda_no - default is 15. How many lambda's are used in |
| warm-up, will be overridden if warmup_lambdas |
| is not NULL |
| warmup_tolerance - default is the same as tolerance. The value |
| of tolerance used during warmup. |
| n_folds - default is 1, Number of cross validation folds. |
| Set this to greater than 1 if CV over lambda is required. |
| validation_result - Name of the table to store the cross validation results. |
| parallel - default is True. Run the computation on |
| multiple segments or not. |
| |
| When warmup is True or if n_folds > 1, and warmup_lambdas is NULL, a series |
| of lambda values will be automatically generated and used. |
| |
| Reference -------------------------------- |
| [1] Shai Shalev-Shwartz and Ambuj Tewari, Stochastic Methods for l1 |
| Regularized Loss Minimization. Proceedings of the 26th Interna- |
| tional Conference on Machine Learning, Montreal, Canada, 2009. |
| """ |
| |
| if family_or_optimizer.lower() == "fista": |
| return """ |
| ---------------------------------------------------------------- |
| Fast iterative shrinkage thesholding algorithm |
| with backtracking for stepsizes |
| ---------------------------------------------------------------- |
| Right now, it supports fitting both linear and logistic models. |
| |
| Parameters -------------------------------- |
| max_stepsize - default is 4.0 |
| eta - default is 1.2, if stepsize does not work |
| stepsize/eta will be tried |
| warmup - default is False |
| warmup_lambdas - default is NULL, which means that lambda |
| values will be automatically generated |
| warmup_lambda_no - default is 15. How many lambda's are used in |
| warm-up, will be overridden if warmup_lambdas |
| is not NULL |
| warmup_tolerance - default is the same as tolerance. The value |
| of tolerance used during warmup. |
| use_active_set - default is False. Sometimes active-set method |
| can speed up the calculation. |
| activeset_tolerance - default is the same as tolerance. The |
| value of tolerance used during active set |
| calculation |
| random_stepsize - default is False. Whether add some randomness |
| to the step size. Sometimes, this can speed |
| up the calculation. |
| n_folds - default is 1. Number of cross validation folds. |
| Set this to greater than 1 if CV over lambda is required. |
| validation_result - Name of the table to store the cross validation results. |
| |
| When warmup is True and warmup_lambdas is NULL, warmup_lambda_no |
| of lambda values will be automatically generated and used. |
| |
| Reference -------------------------------- |
| [1] Beck, A. and M. Teboulle (2009), A fast iterative |
| shrinkage-thresholding algorothm for linear inverse |
| problems. SIAM J. on Imaging Sciences 2(1), 183-202. |
| """ |
| |
| # if family_or_optimizer.lower() == "newton": |
| # return "Newton method " |
| |
| return """ |
| Elastic Net error: Not a supported response family or optimizer |
| Run: |
| SELECT {schema_madlib}.elastic_net_train(); |
| for help |
| """.format(schema_madlib=schema_madlib) |
| # ------------------------------------------------------------------------ |
| |
| |
| def elastic_net_train(schema_madlib, source_table, model_table, dependent_varname, |
| independent_varname, regress_family, alpha, lambda_value, |
| standardize, grouping_col, optimizer, |
| optimizer_params, excluded, max_iter, tolerance, |
| **kwargs): |
| """ |
| A wrapper for all variants of elastic net regularization. |
| |
| @param source_table Name of data source table |
| @param independent_varname Name of independent variable column, |
| independent variable is an array |
| @param dependent_varname Name of dependent variable column |
| @param model_table Name of the table to store the results, |
| will return fitting coefficients and |
| likelihood |
| @param lambda_value The regularization parameter |
| @param alpha The elastic net parameter, [0, 1] |
| @param standardize Whether to normalize the variables |
| @param regress_family Response type, 'gaussian' or 'binomial' |
| @param optimizer The optimization algorithm, for example 'igd' |
| @param optimizer_params Parameters of the above optimizer, the format |
| is '{arg = value, ...}'::varchar[] |
| @param excluded Which variables are excluded when |
| independent_varname == "*" |
| """ |
| with MinWarning("warning"): |
| if regress_family is None: |
| plpy.error(""" |
| Elastic Net error: Please enter a valid response family name! |
| Run: |
| SELECT {schema_madlib}.elastic_net_train(); |
| for supported response family. |
| """.format(schema_madlib=schema_madlib)) |
| |
| if optimizer is None: |
| plpy.error(""" |
| Elastic Net error: Please enter a valid optimizer name! |
| Run: |
| SELECT {schema_madlib}.elastic_net_train('gaussian'); |
| for supported optimizers. |
| """.format(schema_madlib=schema_madlib)) |
| |
| regress_family = regress_family.lower() |
| optimizer = optimizer.lower() |
| |
| if (regress_family not in (BINOMIAL_FAMILIES + GAUSSIAN_FAMILIES) or |
| optimizer not in OPTIMIZERS): |
| plpy.error(""" |
| Elastic Net error: Not a supported response family or supported |
| optimizer of the given response family! |
| |
| Run: |
| SELECT {schema_madlib}.elastic_net_train(); |
| for help. |
| """.format(schema_madlib=schema_madlib)) |
| |
| cv_param, optimizer_params = _get_cv_optimizer_params( |
| optimizer_params, alpha, lambda_value) |
| |
| args = locals() |
| if cv_param['n_folds'] > 1: |
| args.update(cv_param) |
| _cross_validate_en(args) |
| _internal_elastic_net_train(**args) |
| # ------------------------------------------------------------------------ |
| |
| |
| def _internal_elastic_net_train( |
| schema_madlib, source_table, model_table, dependent_varname, |
| independent_varname, grouping_col, |
| regress_family, alpha, lambda_value, |
| standardize, optimizer, optimizer_params, excluded, |
| max_iter, tolerance, **kwargs): |
| |
| tbl_summary = add_postfix(model_table, "_summary") |
| |
| # handle all special cases of independent_varname |
| independent_varname, outstr_array = analyze_input_str( |
| schema_madlib, source_table, |
| independent_varname, dependent_varname, excluded) |
| |
| # get the grouping info |
| grouping_str, grouping_col = _get_grouping_col_str(schema_madlib, |
| source_table, grouping_col) |
| |
| if regress_family in GAUSSIAN_FAMILIES: |
| if optimizer == OPTIMIZERS.igd: |
| _elastic_net_gaussian_igd_train( |
| schema_madlib, source_table, independent_varname, dependent_varname, |
| model_table, tbl_summary, lambda_value, alpha, standardize, |
| optimizer_params, max_iter, tolerance, outstr_array, |
| grouping_str, grouping_col, **kwargs) |
| return None |
| if optimizer == OPTIMIZERS.fista: |
| _elastic_net_gaussian_fista_train( |
| schema_madlib, source_table, independent_varname, dependent_varname, |
| model_table, tbl_summary, lambda_value, alpha, standardize, |
| optimizer_params, max_iter, tolerance, outstr_array, |
| grouping_str, grouping_col, **kwargs) |
| return None |
| elif regress_family in BINOMIAL_FAMILIES: |
| if optimizer == OPTIMIZERS.igd: |
| dependent_varname = "(" + dependent_varname + ")::boolean" |
| _elastic_net_binomial_igd_train( |
| schema_madlib, source_table, independent_varname, dependent_varname, |
| model_table, tbl_summary, lambda_value, alpha, standardize, |
| optimizer_params, max_iter, tolerance, outstr_array, |
| grouping_str, grouping_col, **kwargs) |
| return None |
| if optimizer == OPTIMIZERS.fista: |
| dependent_varname = "(" + dependent_varname + ")::boolean" |
| _elastic_net_binomial_fista_train( |
| schema_madlib, source_table, independent_varname, dependent_varname, |
| model_table, tbl_summary, lambda_value, alpha, standardize, |
| optimizer_params, max_iter, tolerance, outstr_array, |
| grouping_str, grouping_col, **kwargs) |
| return None |
| return None |
| # ---------------------------------------------------------------------- |
| |
| |
| def _get_cv_optimizer_params(param_str, alpha, smallest_lambda): |
| cv_params_defaults = { |
| "n_folds": (1, int), |
| "lambda_value": (None, list), |
| "alpha": ([alpha], list), |
| "n_lambdas": (15, int), |
| "validation_result": (None, str) |
| } |
| param_defaults = dict([(k, v[0]) for k, v in cv_params_defaults.items()]) |
| param_types = dict([(k, v[1]) for k, v in cv_params_defaults.items()]) |
| |
| if not param_str: |
| return param_defaults, param_str |
| |
| name_value = extract_keyvalue_params(param_str, param_types, param_defaults, |
| ignore_invalid=True) |
| if name_value['n_folds'] > 1: |
| if not name_value['lambda_value']: |
| if name_value['n_lambdas']: |
| name_value['lambda_value'] = _generate_warmup_lambda_sequence( |
| smallest_lambda, name_value['n_lambdas']) |
| # no warmup when cross validating on lambda |
| param_str += ', warmup=False' |
| else: |
| name_value['lambda_value'] = [float(smallest_lambda)] |
| else: |
| name_value['lambda_value'] = map(float, name_value['lambda_value']) |
| # no warmup when cross validating on lambda |
| param_str += ', warmup=False' |
| name_value['alpha'] = map(float, name_value['alpha']) |
| return name_value, param_str |
| # ------------------------------------------------------------------------ |
| |
| |
| def _cross_validate_en(args): |
| # updating params_dict will also update args['params_dict'] |
| if args['n_folds'] > 1 and args['grouping_col']: |
| plpy.error('Elastic Net Error: cross validation with grouping is not supported!') |
| |
| allowed_cv_params = ('lambda_value', 'alpha') # keep trailing comma for single element |
| cv_params_values = {} |
| |
| for param in allowed_cv_params: |
| if isinstance(args[param], list): |
| if len(args[param]) > 1: |
| cv_params_values[param] = args[param] |
| else: |
| args[param] = args[param][0] |
| |
| if not cv_params_values and args['n_folds'] <= 1: |
| # no cross validation |
| return |
| |
| if not cv_params_values and args['n_folds'] > 1: |
| plpy.warning('Elastic Net Warning: n_folds > 1 but no ' |
| 'cross validation parameter provided') |
| return |
| |
| if cv_params_values and args['n_folds'] <= 1: |
| plpy.error('Elastic Net Error: All parameters must be scalar ' |
| 'when n_folds is 0 or 1') |
| |
| scorer = 'classification' if args['regress_family'] in BINOMIAL_FAMILIES else 'regression' |
| cv = CrossValidator(_internal_elastic_net_train, elastic_net_predict_all, scorer, args) |
| val_res = cv.validate(cv_params_values, args['n_folds']) |
| if 'validation_result' in args: |
| val_res.output_tbl(args['validation_result']) |
| args.update(val_res.top('sub_args')) |
| # ------------------------------------------------------------------------------ |
| |
| |
| def _get_grouping_col_str(schema_madlib, source_table, grouping_col): |
| if grouping_col and grouping_col.lower() != 'null': |
| cols_in_tbl_valid(source_table, |
| _string_to_array_with_quotes(grouping_col), |
| 'elastic_net') |
| intersect = frozenset( |
| _string_to_array(grouping_col)).intersection( |
| frozenset( |
| ('regress_family', 'coef_all', |
| 'features_selected', |
| 'coef_nonzero', 'intercept', |
| 'log_likelihood', 'standardize', |
| 'iteration_run'))) |
| _assert(len(intersect) == 0, |
| "Elastic Net error: Conflicting grouping column name.\n" |
| "Some predefined keyword(s) ({0}) are not allowed " |
| "for grouping column names!".format(', '.join(intersect))) |
| |
| grouping_list = [i + "::text" |
| for i in explicit_bool_to_text( |
| source_table, |
| _string_to_array_with_quotes(grouping_col), |
| schema_madlib)] |
| grouping_str = ','.join(grouping_list) |
| else: |
| grouping_str = "Null" |
| grouping_col = None |
| |
| return grouping_str, grouping_col |
| # ------------------------------------------------------------------------------ |
| |
| |
| def _check_args(tbl_source, col_ind_var, col_dep_var): |
| """ |
| Check arguments before analyze_input_str |
| """ |
| if any(each_arg is None for each_arg in (tbl_source, col_ind_var, col_dep_var)): |
| plpy.error("Elastic Net error: You have unsupported NULL value(s) in the arguments!") |
| if any(each_arg.strip() == '' for each_arg in (tbl_source, col_ind_var, col_dep_var)): |
| plpy.error("Elastic Net error: You have unsupported EMPTY value(s) in the arguments!") |
| |
| if not table_exists(tbl_source): |
| plpy.error("Elastic Net error: Data table " + tbl_source + " does not exist!") |
| |
| if table_is_empty(tbl_source): |
| plpy.error("Elastic Net error: Data table " + tbl_source + " is empty!") |
| # ------------------------------------------------------------------------ |
| |
| |
| def analyze_input_str(schema_madlib, tbl_source, |
| col_ind_var, col_dep_var, excluded): |
| """ |
| Make input strings and output strings compatible with functions |
| |
| @param tbl_source Data table |
| @param col_ind_var Independent variables |
| @param col_dep_var Dependent variables |
| @param excluded Which variables are excluded when col_ind_var == "*" |
| """ |
| _check_args(tbl_source, col_ind_var, col_dep_var) |
| |
| outstr_array = [] |
| if col_ind_var == "*": |
| col_types_dict = dict(get_cols_and_types(tbl_source)) |
| cols = col_types_dict.keys() |
| |
| s = _string_to_array(excluded) if excluded is not None else [] |
| |
| for each_col in cols: |
| if each_col not in s and each_col != col_dep_var: |
| outstr_array.append(each_col) |
| |
| if not outstr_array: |
| plpy.error("Elastic Net error: All columns from independent variables " |
| "have been excluded") |
| elif (len(outstr_array) == 1 and |
| col_types_dict[outstr_array[0]].lower() == 'array'): |
| col_ind_var = outstr_array[0] |
| return analyze_single_input_str(schema_madlib, tbl_source, |
| col_ind_var) |
| else: |
| included_col_types = [col_types_dict[i] for i in outstr_array] |
| if not all(is_psql_numeric_type(i) |
| for i in included_col_types): |
| plpy.error("Elastic Net error: All columns to be included in the " |
| "independent variables should be of the numeric type.") |
| col_ind_var_new = "ARRAY[" + ','.join(outstr_array) + "]" |
| return (col_ind_var_new, outstr_array) |
| |
| if columns_exist_in_table(tbl_source, [col_ind_var], schema_madlib): |
| # if the input is a column name and not an expression |
| return analyze_single_input_str(schema_madlib, tbl_source, |
| col_ind_var, excluded) |
| else: |
| # if input is an expression resulting in an array output |
| matched = re.match(r"(?i)^array\[(.*)\]", col_ind_var) |
| if matched: |
| # array expression starts with the word "ARRAY" |
| outstr_array = _string_to_array(matched.group(1)) |
| else: |
| # any other form of array expression |
| n_feat = plpy.execute(""" SELECT array_upper({indep_var}, 1) as num_feat |
| FROM {source} LIMIT 1 |
| """.format(indep_var=col_ind_var, |
| source=tbl_source))[0]["num_feat"] |
| outstr_array = ["[" + str(i) + "]" for i in range(1, n_feat + 1)] |
| # We allow expressions for independent variables that could start with |
| # something other than 'array' |
| # Example use case: input independent variable of array column |
| # adding an intercept could be done as '1 || x' where 'x' is array |
| # of independent variables. |
| # plpy.error("Elastic Net error: Independent variable format is not quite right!") |
| return (col_ind_var, outstr_array) |
| # ------------------------------------------------------------------------ |
| |
| |
| def analyze_single_input_str(schema_madlib, tbl_source, col_ind_var, |
| excluded=None): |
| """ |
| Args: |
| @param schema_madlib: string, Name of schema where MADlib is installed |
| @param tbl_source: string, Name of input table |
| @param col_ind_var: string, Name of independent variable |
| (must be single column name and of ARRAY type ) |
| @param excluded: list, Indices of elements to exclude. |
| |
| Returns: |
| |
| """ |
| if columns_exist_in_table(tbl_source, [col_ind_var], schema_madlib): |
| # a single column is independent variable |
| # which means that it is an array |
| # excluded must be a string containing integers |
| if not is_col_array(tbl_source, col_ind_var): |
| plpy.error("Elastic Net error: The independent variable must be an array!") |
| |
| dimension = plpy.execute( |
| """ |
| SELECT array_upper({col_ind_var}, 1) AS dimension |
| FROM {tbl_source} limit 1 |
| """.format(tbl_source=tbl_source, |
| col_ind_var=col_ind_var))[0]["dimension"] |
| |
| if excluded is not None: |
| s = _string_to_array(excluded) |
| invalid_excluded = """ |
| Elastic Net error: When the independent variable is |
| an array column, excluded values can only be indices |
| (i.e. between 1 and {0})""".format(dimension) |
| try: |
| s = [int(i) for i in s] |
| except: |
| plpy.error(invalid_excluded) |
| if any(i < 1 or i > dimension for i in s): |
| plpy.error(invalid_excluded) |
| else: |
| s = [] |
| |
| outstr_array = ["%s[%s]" % (col_ind_var, str(i)) |
| for i in range(1, dimension + 1) if i not in s] |
| if s: |
| col_ind_var_new = "ARRAY[" + ",".join(outstr_array) + "]" |
| else: |
| col_ind_var_new = col_ind_var |
| |
| return (col_ind_var_new, outstr_array) |
| else: |
| plpy.error("Elastic Net error: Single column name included for " |
| "independent variable is not found in source table.") |
| # ------------------------------------------------------------------------ |
| |
| |
| def elastic_net_predict_all(schema_madlib, tbl_model, tbl_new_source, |
| col_id, tbl_predict, **kwargs): |
| """ |
| Predict and put the result in a table. Useful for general CV |
| """ |
| summary_table = add_postfix(tbl_model, "_summary") |
| grouping_col = plpy.execute("SELECT grouping_col FROM {summary_table}". |
| format(summary_table=summary_table))[0]["grouping_col"] |
| with MinWarning("error"): |
| regress_family = plpy.execute("SELECT family FROM {tbl_model} ". |
| format(tbl_model=tbl_model))[0]["family"] |
| |
| if regress_family.lower() in ("gaussian", "linear"): |
| predict_func = "elastic_net_gaussian_predict" |
| elif regress_family.lower() in ("binomial", "logistic"): |
| predict_func = "elastic_net_binomial_predict" |
| else: |
| plpy.error("Elastic Net error: Not a supported response family!") |
| |
| if col_id is None or col_id == '': |
| plpy.error("Elastic Net error: invalid ID column provided!") |
| if columns_exist_in_table(tbl_new_source, [col_id], schema_madlib): |
| elastic_net_predict_id = col_id |
| else: |
| elastic_net_predict_id = 'elastic_net_predict_id' |
| |
| dense_vars = plpy.execute(""" SELECT features AS fs FROM {tbl_model} |
| """.format(tbl_model=tbl_model))[0]["fs"] |
| dense_vars_str = "ARRAY[" + ", ".join(dense_vars) + "]" |
| # Must be careful to avoid possible name conflicts |
| |
| if grouping_col and grouping_col != 'NULL': |
| qstr = """ |
| CREATE TABLE {tbl_predict} AS |
| SELECT |
| {elastic_net_predict_id}, |
| {schema_madlib}.{predict_func}(coef_all, intercept, ind_var) |
| AS prediction |
| FROM |
| {tbl_model} as tbl1 |
| JOIN |
| (SELECT |
| {grouping_col}, |
| {col_id} as {elastic_net_predict_id}, |
| {dense_vars_str} as ind_var |
| FROM |
| {tbl_new_source}) tbl2 |
| USING ({grouping_col}) |
| ORDER BY {grouping_col}, {elastic_net_predict_id} |
| """.format(**locals()) |
| else: |
| qstr = """ |
| CREATE TABLE {tbl_predict} AS |
| SELECT |
| {elastic_net_predict_id}, |
| {schema_madlib}.{predict_func}(coef_all, intercept, ind_var) |
| AS prediction |
| FROM |
| {tbl_model} as tbl1, |
| ( |
| SELECT |
| {col_id} as {elastic_net_predict_id}, |
| {dense_vars_str} as ind_var |
| FROM |
| {tbl_new_source} |
| ) tbl2 |
| """.format(**locals()) |
| plpy.execute(qstr) |
| return None |
| # ------------------------------------------------------------------------ |