| import plpy |
| import math |
| import re |
| from utilities.utilities import _string_to_array |
| from utilities.utilities import _array_to_string |
| from convex.utils_regularization import utils_ind_var_scales |
| from convex.utils_regularization import __utils_dep_var_scale |
| from convex.utils_regularization import __utils_normalize_data |
| from convex.utils_regularization import utils_ind_var_scales_grouping |
| from convex.utils_regularization import __utils_dep_var_scale_grouping |
| from convex.utils_regularization import __utils_normalize_data_grouping |
| from utilities.validate_args import table_exists |
| |
| from collections import namedtuple |
| |
| # ------------------------------------------------------------------------ |
| # -- constants ----------------------------------------------------------- |
| |
| # below constants are defined in a manner that allow using them as enums: |
| # 'igd' in OPTIMIZERS (returns True) |
| # 'igd' == OPTIMIZERS.igd (returns True) |
| # To change the |
| BINOMIAL_FAMILIES = namedtuple("bin", ("binomial logistic"))('binomial', 'logistic') |
| GAUSSIAN_FAMILIES = namedtuple("gau", ("gaussian linear"))('gaussian', 'linear') |
| OPTIMIZERS = namedtuple("opt", ("igd fista"))('igd', 'fista') |
| # ------------------------------------------------------------------------- |
| |
| |
| def _process_results(coef, intercept, outstr_array): |
| """ |
| Return features, features_selected, dense_coef |
| """ |
| if not outstr_array: |
| raise ValueError("Invalid feature name array: {0}".format(str(outstr_array))) |
| if not coef: |
| raise ValueError("Invalid coef array: {0}".format(str(coef))) |
| |
| features = _array_to_string(outstr_array) |
| selected_array = [] |
| dense_coef = [] |
| for i in range(len(coef)): |
| if coef[i] != 0: |
| selected_array.append(outstr_array[i]) |
| dense_coef.append(coef[i]) |
| features_selected = _array_to_string(selected_array) |
| dense_coef = _array_to_string(dense_coef) |
| |
| return (features, features_selected, dense_coef, _array_to_string(coef)) |
| # ------------------------------------------------------------------------ |
| |
| |
| def _process_warmup_lambdas(lambdas, lambda_value): |
| """ |
| Convert the string of warmup_lambdas into an double array |
| @param lambdas The string which will be converted to an array |
| @param lambda_value The target value of lambda, which must be equal to |
| the last element of the input array |
| """ |
| matched = re.match(r"^[\[\{\(](.*)[\]\}\)]$", lambdas) |
| if matched is None: |
| plpy.error("Elastic Net error: warmup_lambdas must be NULL or something like {3,2,1} !") |
| |
| elm = _string_to_array(matched.group(1)) |
| elm = [float(i) for i in elm] |
| |
| if elm[- 1] != lambda_value: |
| plpy.error(""" |
| Elastic Net error: The last element of warmup_lambdas must |
| be equal to the lambda value that you want to compute ! |
| """) |
| |
| if len(elm) > 1: |
| for i in range(len(elm) - 1): |
| if elm[i] <= elm[i + 1]: |
| plpy.error(""" |
| Elastic Net error: The given warm-up array must be |
| in a strict descent order. |
| """) |
| |
| return elm |
| # ------------------------------------------------------------------------ |
| |
| |
| def _generate_warmup_lambda_sequence(lambda_value, n_steps): |
| """ |
| Compute lambda sequence, when warmup is True and warmup_lambdas are |
| not given |
| """ |
| if n_steps == 1: |
| return [lambda_value] |
| |
| largest = 1e5 |
| |
| if abs(lambda_value - 0.) < 1e-6: |
| zero_lambda = True |
| smallest = 0.001 * largest |
| n_steps -= 1 |
| else: |
| smallest = lambda_value |
| zero_lambda = False |
| |
| smallest, largest = min(smallest, largest), max(smallest, largest) |
| step = math.log(smallest / largest) / (float(n_steps) - 1) |
| constant = math.log(largest) |
| |
| seq = [math.exp(j * step + constant) for j in range(n_steps)] |
| if zero_lambda: |
| seq.append(0.) |
| |
| return seq |
| # ------------------------------------------------------------------------ |
| |
| def _compute_average_sq(**args): |
| """ |
| Compute the average squares of all features, used to estimtae the largest lambda |
| Actually only the largest value is used, so order does not matter here |
| """ |
| sq = [1] * args["dimension"] |
| if args["normalization"] is False: |
| for i in range(args["dimension"]): |
| sq[i] = (args["x_scales"]["std"][i]) ** 2 + (args["x_scales"]["mean"][i]) ** 2 |
| |
| return sq |
| # ------------------------------------------------------------------------ |
| |
| |
| def _compute_log_likelihood(coef, intercept, **args): |
| """ |
| Compute the log-likelihood at the end of calculation |
| """ |
| if args["family"] == "gaussian": # linear models |
| loss_query = """ |
| select |
| {method}(({col_dep_var_new} - {schema_madlib}.elastic_net_gaussian_predict( |
| '{coefficients}'::double precision[], |
| {intercept}::double precision, |
| {col_ind_var_new}))^2)/({denominator}) |
| as loss |
| from |
| {tbl_used} |
| """ |
| # See jira 1094, avg experiences numerical instability |
| denominator = "2." |
| method = "avg" |
| if not args["normalization"]: |
| method = "sum" |
| denominator = "count(*) * 2." |
| loss = plpy.execute( |
| loss_query.format(coefficients=_array_to_string(coef), |
| intercept=intercept, |
| method=method, |
| denominator=denominator, |
| **args))[0]["loss"] |
| elif args["family"] == "binomial": # logistic models |
| loss = plpy.execute( |
| """ |
| select |
| avg({schema_madlib}.__elastic_net_binomial_loglikelihood( |
| '{coefficients}'::double precision[], |
| {intercept}, |
| {col_dep_var_new}, |
| {col_ind_var_new})) |
| as loss |
| from {tbl_used} |
| """.format(coefficients=_array_to_string(coef), |
| intercept=intercept, |
| **args))[0]["loss"] |
| module_1 = sum(x * x for x in coef) / 2. |
| module_2 = sum(abs(x) for x in coef) |
| |
| log_likelihood = - (loss + args["lambda_value"] * |
| ((1 - args["alpha"]) * module_1 + args["alpha"] * module_2)) |
| return log_likelihood |
| # ------------------------------------------------------------------------ |
| |
| |
| def _elastic_net_validate_args(tbl_source, col_ind_var, col_dep_var, |
| tbl_result, tbl_summary, lambda_value, alpha, |
| normalization, max_iter, tolerance): |
| if (any(i is None for i in (lambda_value, alpha, normalization)) or |
| any(not i for i in (tbl_source, col_ind_var, col_dep_var, tbl_result))): |
| plpy.error("Elastic Net error: You have unsupported NULL/empty value(s) in the arguments!") |
| |
| if table_exists(tbl_result, only_first_schema=True): |
| plpy.error("Elastic Net error: Output table " + tbl_result + " already exists!") |
| |
| if table_exists(tbl_summary, only_first_schema=True): |
| plpy.error("Elastic Net error: Output summary table " + tbl_summary + " already exists!") |
| |
| if lambda_value < 0: |
| plpy.error("Elastic Net error: The regularization parameter lambda cannot be negative!") |
| |
| if alpha < 0 or alpha > 1: |
| plpy.error("Elastic Net error: The elastic net control parameter alpha must be in [0,1] !") |
| |
| if max_iter <= 0: |
| plpy.error("Elastic Net error: max_iter must be positive!") |
| |
| if tolerance < 0: |
| plpy.error("Elastic Net error: tolerance must be positive!") |
| |
| return None |
| # ------------------------------------------------------------------------ |
| def _compute_scales(args): |
| if args["grouping_col"]: |
| _compute_data_scales_grouping(args) |
| else: |
| _compute_data_scales(args) |
| |
| def _compute_data_scales_grouping(args): |
| # When grouping_col is defined, we must find an array containing |
| # the mean of every dimension in the independent variable (x), the |
| # mean of dependent variable (y) and the standard deviation for them |
| # specific to groups. Store these results in temp tables x_mean_table |
| # and y_mean_table. |
| utils_ind_var_scales_grouping(args["tbl_source"], args["col_ind_var"], |
| args["dimension"], args["schema_madlib"], args["grouping_col"], |
| args["x_mean_table"]) |
| if args["family"] == "binomial": |
| # set mean and std to 0 and 1 respectively, for each group. |
| __utils_dep_var_scale_grouping(args["y_mean_table"], |
| args["tbl_source"], args["grouping_col"], args["family"]) |
| else: |
| __utils_dep_var_scale_grouping(args["y_mean_table"], |
| args["tbl_source"], args["grouping_col"], args["family"], |
| args["schema_madlib"], args["col_ind_var"], args["col_dep_var"]) |
| |
| def _compute_data_scales(args): |
| args["x_scales"] = utils_ind_var_scales(args["tbl_source"], |
| args["col_ind_var"], args["dimension"], args["schema_madlib"]) |
| if args["family"] == "binomial": |
| args["y_scale"] = dict(mean=0, std=1) |
| else: |
| args["y_scale"] = __utils_dep_var_scale(args["schema_madlib"], |
| args["tbl_source"], args["col_ind_var"], args["col_dep_var"]) |
| |
| args["xmean_str"] = _array_to_string(args["x_scales"]["mean"]) |
| # ------------------------------------------------------------------------ |
| |
| |
| def _normalize_data(args): |
| """ |
| Compute the scaling factors for independent and dependent |
| variables, and then scale the original data. |
| |
| The output is stored in tbl_data_scaled |
| """ |
| y_decenter = True if args["family"] == "gaussian" else False |
| _compute_scales(args) |
| if args["grouping_col"]: |
| # When grouping_col is defined, we must find an array containing |
| # the mean of every dimension in the independent variable (x), the |
| # mean of dependent variable (y) and the standard deviation for them |
| # specific to groups. Store these results in temp tables x_mean_table |
| # and y_mean_table. |
| # __utils_normalize_data_grouping reads the various means and stds |
| # from the tables. |
| __utils_normalize_data_grouping(y_decenter=y_decenter, |
| tbl_data=args["tbl_source"], |
| col_ind_var=args["col_ind_var"], |
| col_dep_var=args["col_dep_var"], |
| tbl_data_scaled=args["tbl_data_scaled"], |
| col_ind_var_norm_new=args["col_ind_var_norm_new"], |
| col_dep_var_norm_new=args["col_dep_var_norm_new"], |
| schema_madlib=args["schema_madlib"], |
| x_mean_table=args["x_mean_table"], |
| y_mean_table=args["y_mean_table"], |
| grouping_col=args["grouping_col"]) |
| else: |
| # When no grouping_col is defined, the mean and std for both 'x' and |
| # 'y' can be defined using strings, stored in x_mean_str, x_std_str |
| # etc. We don't need a table like how we needed for grouping. |
| __utils_normalize_data(y_decenter=y_decenter, |
| tbl_data=args["tbl_source"], |
| col_ind_var=args["col_ind_var"], |
| col_dep_var=args["col_dep_var"], |
| tbl_data_scaled=args["tbl_data_scaled"], |
| col_ind_var_norm_new=args["col_ind_var_norm_new"], |
| col_dep_var_norm_new=args["col_dep_var_norm_new"], |
| schema_madlib=args["schema_madlib"], |
| x_mean_str=args["xmean_str"], |
| x_std_str=_array_to_string(args["x_scales"]["std"]), |
| y_mean=args["y_scale"]["mean"], |
| y_std=args["y_scale"]["std"], |
| grouping_col=args["grouping_col"]) |
| |
| return None |
| # ------------------------------------------------------------------------ |
| |
| def _compute_means(args): |
| """ |
| Compute the averages of dependent (y) and independent (x) variables |
| """ |
| if args["normalization"]: |
| xmean_str = _array_to_string([0] * args["dimension"]) |
| ymean = 0 |
| return (xmean_str, ymean) |
| if args["grouping_col"]: |
| # We can use the mean of the entire table instead of groups here. |
| # The absolute correct thing to do is to use group specific |
| # mean, but we will need to add a new column and change the input |
| # table contents to do that (it has to be accessed by the group |
| # iteration controller, C++ code). That is a lot more messier, |
| # so living with this approximation for now. |
| _compute_data_scales(args) |
| # If there is no grouping_col, note that _compute_data_scales() was |
| # already called, so we don't have to call it again. |
| return (args["xmean_str"], args["y_scale"]["mean"]) |
| # ------------------------------------------------------------------------ |