import plpy
import math
import re
from utilities.utilities import _string_to_array
from utilities.utilities import _array_to_string
from convex.utils_regularization import utils_ind_var_scales
from convex.utils_regularization import __utils_dep_var_scale
from convex.utils_regularization import __utils_normalize_data
from convex.utils_regularization import utils_ind_var_scales_grouping
from convex.utils_regularization import __utils_dep_var_scale_grouping
from convex.utils_regularization import __utils_normalize_data_grouping
from utilities.validate_args import table_exists
from collections import namedtuple
# ------------------------------------------------------------------------
# -- constants -----------------------------------------------------------
# The constants below are defined so that they can be used like enums, e.g.:
#     'igd' in OPTIMIZERS (returns True)
#     'igd' == OPTIMIZERS.igd (returns True)
BINOMIAL_FAMILIES = namedtuple("bin", "binomial logistic")('binomial', 'logistic')
GAUSSIAN_FAMILIES = namedtuple("gau", "gaussian linear")('gaussian', 'linear')
OPTIMIZERS = namedtuple("opt", "igd fista")('igd', 'fista')
# -------------------------------------------------------------------------
def _process_results(coef, intercept, outstr_array):
"""
    Return features, features_selected, dense_coef and the full coefficient
    array, each formatted as a string
"""
if not outstr_array:
raise ValueError("Invalid feature name array: {0}".format(str(outstr_array)))
if not coef:
raise ValueError("Invalid coef array: {0}".format(str(coef)))
features = _array_to_string(outstr_array)
selected_array = []
dense_coef = []
for i in range(len(coef)):
if coef[i] != 0:
selected_array.append(outstr_array[i])
dense_coef.append(coef[i])
features_selected = _array_to_string(selected_array)
dense_coef = _array_to_string(dense_coef)
return (features, features_selected, dense_coef, _array_to_string(coef))
# ------------------------------------------------------------------------
def _process_warmup_lambdas(lambdas, lambda_value):
"""
    Convert the warmup_lambdas string into an array of floats
@param lambdas The string which will be converted to an array
@param lambda_value The target value of lambda, which must be equal to
the last element of the input array
"""
matched = re.match(r"^[\[\{\(](.*)[\]\}\)]$", lambdas)
if matched is None:
plpy.error("Elastic Net error: warmup_lambdas must be NULL or something like {3,2,1} !")
elm = _string_to_array(matched.group(1))
elm = [float(i) for i in elm]
    if elm[-1] != lambda_value:
        plpy.error("""
            Elastic Net error: The last element of warmup_lambdas must
            be equal to the lambda value that you want to compute!
            """)
if len(elm) > 1:
for i in range(len(elm) - 1):
if elm[i] <= elm[i + 1]:
plpy.error("""
Elastic Net error: The given warm-up array must be
in a strict descent order.
""")
return elm
# ------------------------------------------------------------------------
def _generate_warmup_lambda_sequence(lambda_value, n_steps):
"""
    Compute the lambda sequence when warmup is True and warmup_lambdas is
    not given
"""
if n_steps == 1:
return [lambda_value]
largest = 1e5
if abs(lambda_value - 0.) < 1e-6:
zero_lambda = True
smallest = 0.001 * largest
n_steps -= 1
else:
smallest = lambda_value
zero_lambda = False
smallest, largest = min(smallest, largest), max(smallest, largest)
    # Guard against a zero division when the zero-lambda branch above has
    # reduced n_steps to 1 (e.g. lambda_value = 0 with n_steps = 2).
    step = math.log(smallest / largest) / (n_steps - 1.) if n_steps > 1 else 0.
constant = math.log(largest)
seq = [math.exp(j * step + constant) for j in range(n_steps)]
if zero_lambda:
seq.append(0.)
return seq
# ------------------------------------------------------------------------
def _compute_average_sq(**args):
"""
    Compute the average squares of all features, used to estimate the largest lambda.
    Only the largest value is actually used, so order does not matter here.
"""
sq = [1] * args["dimension"]
if args["normalization"] is False:
for i in range(args["dimension"]):
sq[i] = (args["x_scales"]["std"][i]) ** 2 + (args["x_scales"]["mean"][i]) ** 2
return sq
# ------------------------------------------------------------------------
def _compute_log_likelihood(coef, intercept, **args):
"""
    Compute the log-likelihood at the end of the computation
"""
if args["family"] == "gaussian": # linear models
loss_query = """
select
{method}(({col_dep_var_new} - {schema_madlib}.elastic_net_gaussian_predict(
'{coefficients}'::double precision[],
{intercept}::double precision,
{col_ind_var_new}))^2)/({denominator})
as loss
from
{tbl_used}
"""
        # See jira 1094: avg suffers from numerical instability, so use
        # sum / count(*) instead when the data is not normalized.
denominator = "2."
method = "avg"
if not args["normalization"]:
method = "sum"
denominator = "count(*) * 2."
loss = plpy.execute(
loss_query.format(coefficients=_array_to_string(coef),
intercept=intercept,
method=method,
denominator=denominator,
**args))[0]["loss"]
elif args["family"] == "binomial": # logistic models
loss = plpy.execute(
"""
select
avg({schema_madlib}.__elastic_net_binomial_loglikelihood(
'{coefficients}'::double precision[],
{intercept},
{col_dep_var_new},
{col_ind_var_new}))
as loss
from {tbl_used}
""".format(coefficients=_array_to_string(coef),
intercept=intercept,
**args))[0]["loss"]
    module_1 = sum(x * x for x in coef) / 2.  # ridge (L2) penalty term
    module_2 = sum(abs(x) for x in coef)      # lasso (L1) penalty term
log_likelihood = - (loss + args["lambda_value"] *
((1 - args["alpha"]) * module_1 + args["alpha"] * module_2))
return log_likelihood
# ------------------------------------------------------------------------
def _elastic_net_validate_args(tbl_source, col_ind_var, col_dep_var,
tbl_result, tbl_summary, lambda_value, alpha,
normalization, max_iter, tolerance):
if (any(i is None for i in (lambda_value, alpha, normalization)) or
any(not i for i in (tbl_source, col_ind_var, col_dep_var, tbl_result))):
plpy.error("Elastic Net error: You have unsupported NULL/empty value(s) in the arguments!")
if table_exists(tbl_result, only_first_schema=True):
plpy.error("Elastic Net error: Output table " + tbl_result + " already exists!")
if table_exists(tbl_summary, only_first_schema=True):
plpy.error("Elastic Net error: Output summary table " + tbl_summary + " already exists!")
if lambda_value < 0:
plpy.error("Elastic Net error: The regularization parameter lambda cannot be negative!")
if alpha < 0 or alpha > 1:
plpy.error("Elastic Net error: The elastic net control parameter alpha must be in [0,1] !")
if max_iter <= 0:
plpy.error("Elastic Net error: max_iter must be positive!")
if tolerance < 0:
plpy.error("Elastic Net error: tolerance must be positive!")
return None
# ------------------------------------------------------------------------
def _compute_scales(args):
if args["grouping_col"]:
_compute_data_scales_grouping(args)
else:
_compute_data_scales(args)
def _compute_data_scales_grouping(args):
# When grouping_col is defined, we must find an array containing
# the mean of every dimension in the independent variable (x), the
# mean of dependent variable (y) and the standard deviation for them
# specific to groups. Store these results in temp tables x_mean_table
# and y_mean_table.
utils_ind_var_scales_grouping(args["tbl_source"], args["col_ind_var"],
args["dimension"], args["schema_madlib"], args["grouping_col"],
args["x_mean_table"])
if args["family"] == "binomial":
# set mean and std to 0 and 1 respectively, for each group.
__utils_dep_var_scale_grouping(args["y_mean_table"],
args["tbl_source"], args["grouping_col"], args["family"])
else:
__utils_dep_var_scale_grouping(args["y_mean_table"],
args["tbl_source"], args["grouping_col"], args["family"],
args["schema_madlib"], args["col_ind_var"], args["col_dep_var"])
def _compute_data_scales(args):
args["x_scales"] = utils_ind_var_scales(args["tbl_source"],
args["col_ind_var"], args["dimension"], args["schema_madlib"])
if args["family"] == "binomial":
args["y_scale"] = dict(mean=0, std=1)
else:
args["y_scale"] = __utils_dep_var_scale(args["schema_madlib"],
args["tbl_source"], args["col_ind_var"], args["col_dep_var"])
args["xmean_str"] = _array_to_string(args["x_scales"]["mean"])
# ------------------------------------------------------------------------
def _normalize_data(args):
"""
Compute the scaling factors for independent and dependent
variables, and then scale the original data.
The output is stored in tbl_data_scaled
"""
    y_decenter = (args["family"] == "gaussian")
_compute_scales(args)
if args["grouping_col"]:
# When grouping_col is defined, we must find an array containing
# the mean of every dimension in the independent variable (x), the
# mean of dependent variable (y) and the standard deviation for them
# specific to groups. Store these results in temp tables x_mean_table
# and y_mean_table.
# __utils_normalize_data_grouping reads the various means and stds
# from the tables.
__utils_normalize_data_grouping(y_decenter=y_decenter,
tbl_data=args["tbl_source"],
col_ind_var=args["col_ind_var"],
col_dep_var=args["col_dep_var"],
tbl_data_scaled=args["tbl_data_scaled"],
col_ind_var_norm_new=args["col_ind_var_norm_new"],
col_dep_var_norm_new=args["col_dep_var_norm_new"],
schema_madlib=args["schema_madlib"],
x_mean_table=args["x_mean_table"],
y_mean_table=args["y_mean_table"],
grouping_col=args["grouping_col"])
else:
        # When no grouping_col is defined, the mean and std for both 'x' and
        # 'y' can be passed as plain strings (x_mean_str, x_std_str, etc.);
        # no temp tables are needed as in the grouping case.
__utils_normalize_data(y_decenter=y_decenter,
tbl_data=args["tbl_source"],
col_ind_var=args["col_ind_var"],
col_dep_var=args["col_dep_var"],
tbl_data_scaled=args["tbl_data_scaled"],
col_ind_var_norm_new=args["col_ind_var_norm_new"],
col_dep_var_norm_new=args["col_dep_var_norm_new"],
schema_madlib=args["schema_madlib"],
x_mean_str=args["xmean_str"],
x_std_str=_array_to_string(args["x_scales"]["std"]),
y_mean=args["y_scale"]["mean"],
y_std=args["y_scale"]["std"],
grouping_col=args["grouping_col"])
return None
# ------------------------------------------------------------------------
def _compute_means(args):
"""
Compute the averages of dependent (y) and independent (x) variables
"""
if args["normalization"]:
xmean_str = _array_to_string([0] * args["dimension"])
ymean = 0
return (xmean_str, ymean)
if args["grouping_col"]:
        # We can use the mean of the entire table here instead of per-group
        # means. The strictly correct thing to do is to use group-specific
        # means, but that would require adding a new column and changing the
        # input table contents (which have to be accessed by the group
        # iteration controller, C++ code). That is a lot messier, so we live
        # with this approximation for now.
_compute_data_scales(args)
    # If there is no grouping_col, _compute_data_scales() has already been
    # called, so there is no need to call it again.
return (args["xmean_str"], args["y_scale"]["mean"])
# ------------------------------------------------------------------------