# blob: b2f2505ab4670e48723463d6e37f641be9e933ad [file] [log] [blame]
import plpy
import math
import re
from utilities.utilities import _string_to_array
from utilities.utilities import _array_to_string
from convex.utils_regularization import __utils_ind_var_scales
from convex.utils_regularization import __utils_dep_var_scale
from convex.utils_regularization import __utils_normalize_data
from convex.utils_regularization import __utils_ind_var_scales_grouping
from convex.utils_regularization import __utils_dep_var_scale_grouping
from convex.utils_regularization import __utils_normalize_data_grouping
from utilities.validate_args import table_exists
from collections import namedtuple
# ------------------------------------------------------------------------
# -- constants -----------------------------------------------------------
# Each constant below is a namedtuple instance whose field names are the same
# as its values, which lets it be used like an enum:
#   'igd' in OPTIMIZERS         (membership test, returns True)
#   OPTIMIZERS.igd == 'igd'     (attribute access, returns True)
BINOMIAL_FAMILIES = namedtuple("bin", ["binomial", "logistic"])(
    binomial='binomial', logistic='logistic')
GAUSSIAN_FAMILIES = namedtuple("gau", ["gaussian", "linear"])(
    gaussian='gaussian', linear='linear')
OPTIMIZERS = namedtuple("opt", ["igd", "fista"])(
    igd='igd', fista='fista')
# -------------------------------------------------------------------------
def _process_results(coef, intercept, outstr_array):
Return features, features_selected, dense_coef
if not outstr_array:
raise ValueError("Invalid feature name array: {0}".format(str(outstr_array)))
if not coef:
raise ValueError("Invalid coef array: {0}".format(str(coef)))
features = _array_to_string(outstr_array)
selected_array = []
dense_coef = []
for i in range(len(coef)):
if coef[i] != 0:
features_selected = _array_to_string(selected_array)
dense_coef = _array_to_string(dense_coef)
return (features, features_selected, dense_coef, _array_to_string(coef))
# ------------------------------------------------------------------------
def _process_warmup_lambdas(lambdas, lambda_value):
    """
    Convert the string of warmup_lambdas into a double array

    @param lambdas The string which will be converted to an array; it must
    be wrapped in one pair of brackets, e.g. {3,2,1}
    @param lambda_value The target value of lambda, which must be equal to
    the last element of the input array

    @return List of floats parsed from lambdas, in strictly descending order

    Every validation failure is reported through plpy.error.
    """
    matched = re.match(r"^[\[\{\(](.*)[\]\}\)]$", lambdas)
    if matched is None:
        plpy.error("Elastic Net error: warmup_lambdas must be NULL or something like {3,2,1} !")

    # Only the text between the outer brackets is split into elements.
    elm = _string_to_array(matched.group(1))
    elm = [float(i) for i in elm]

    # An empty array would otherwise fail below with a bare IndexError.
    if not elm:
        plpy.error("Elastic Net error: warmup_lambdas must be NULL or something like {3,2,1} !")

    if elm[-1] != lambda_value:
        plpy.error("Elastic Net error: The last element of warmup_lambdas must "
                   "be equal to the lambda value that you want to compute !")

    # Each element must be strictly larger than its successor.
    for current, following in zip(elm, elm[1:]):
        if current <= following:
            plpy.error("Elastic Net error: The given warm-up array must be "
                       "in a strict descent order.")

    return elm
# ------------------------------------------------------------------------
def _generate_warmup_lambda_sequence(lambda_value, n_steps):
Compute lambda sequence, when warmup is True and warmup_lambdas are
not given
if n_steps == 1:
return [lambda_value]
largest = 1e5
if abs(lambda_value - 0.) < 1e-6:
zero_lambda = True
smallest = 0.001 * largest
n_steps -= 1
smallest = lambda_value
zero_lambda = False
smallest, largest = min(smallest, largest), max(smallest, largest)
step = math.log(smallest / largest) / (float(n_steps) - 1)
constant = math.log(largest)
seq = [math.exp(j * step + constant) for j in range(n_steps)]
if zero_lambda:
return seq
# ------------------------------------------------------------------------
def _compute_average_sq(**args):
Compute the average squares of all features, used to estimtae the largest lambda
Actually only the largest value is used, so order does not matter here
sq = [1] * args["dimension"]
if args["normalization"] is False:
for i in range(args["dimension"]):
sq[i] = (args["x_scales"]["std"][i]) ** 2 + (args["x_scales"]["mean"][i]) ** 2
return sq
# ------------------------------------------------------------------------
# NOTE(review): this block looks damaged by extraction -- indentation, the
# docstring delimiters and several lines (the SQL framing and the closing of
# both plpy.execute(...) calls) are missing. Restore it from the upstream
# source before relying on it; the code lines below are kept exactly as found.
#
# Purpose, as far as the visible lines show: obtain a data-fit `loss` through
# plpy.execute for the "gaussian" or the "binomial" family, add the penalty
#   lambda_value * ((1 - alpha) * sum(c^2) / 2 + alpha * sum(|c|))
# computed over `coef`, and return the negated total.
def _compute_log_likelihood(coef, intercept, **args):
Compute the log-likelihood at the end of calculation
if args["family"] == "gaussian": # linear models
loss = plpy.execute(
avg(({col_dep_var_new} - {schema_madlib}.elastic_net_gaussian_predict(
'{coefficients}'::double precision[],
{intercept}::double precision,
{col_ind_var_new}))^2) / 2.
as loss
elif args["family"] == "binomial": # logistic models
loss = plpy.execute(
'{coefficients}'::double precision[],
as loss
from {tbl_used}
# Half of the sum of squared coefficients (L2 part of the penalty).
module_1 = sum(x * x for x in coef) / 2.
# Sum of absolute coefficient values (L1 part of the penalty).
module_2 = sum(abs(x) for x in coef)
# alpha weights the L1 part and (1 - alpha) the L2 part; the result is negated.
log_likelihood = - (loss + args["lambda_value"] *
((1 - args["alpha"]) * module_1 + args["alpha"] * module_2))
return log_likelihood
# ------------------------------------------------------------------------
def _elastic_net_validate_args(tbl_source, col_ind_var, col_dep_var,
                               tbl_result, tbl_summary, lambda_value, alpha,
                               normalization, max_iter, tolerance):
    """
    Validate the arguments of an elastic net run

    Every failed check is reported through plpy.error.

    @param tbl_source Source table name; must be non-empty
    @param col_ind_var Independent variable column; must be non-empty
    @param col_dep_var Dependent variable column; must be non-empty
    @param tbl_result Output table name; must be non-empty and must not
    exist yet
    @param tbl_summary Output summary table name; must not exist yet
    @param lambda_value Regularization parameter; not None and not negative
    @param alpha Elastic net control parameter; not None and within [0, 1]
    @param normalization Normalization flag; must not be None
    @param max_iter Maximum number of iterations; must be greater than 0
    @param tolerance Stopping tolerance; negative values are rejected
    (0 is accepted)

    @return None
    """
    if (any(i is None for i in (lambda_value, alpha, normalization)) or
            any(not i for i in (tbl_source, col_ind_var, col_dep_var, tbl_result))):
        plpy.error("Elastic Net error: You have unsupported NULL/empty value(s) in the arguments!")

    if table_exists(tbl_result, only_first_schema=True):
        plpy.error("Elastic Net error: Output table " + tbl_result + " already exists!")

    if table_exists(tbl_summary, only_first_schema=True):
        plpy.error("Elastic Net error: Output summary table " + tbl_summary + " already exists!")

    if lambda_value < 0:
        plpy.error("Elastic Net error: The regularization parameter lambda cannot be negative!")

    if alpha < 0 or alpha > 1:
        plpy.error("Elastic Net error: The elastic net control parameter alpha must be in [0,1] !")

    if max_iter <= 0:
        plpy.error("Elastic Net error: max_iter must be positive!")

    if tolerance < 0:
        plpy.error("Elastic Net error: tolerance must be positive!")

    return None
# ------------------------------------------------------------------------
# NOTE(review): this block looks damaged by extraction -- indentation is gone,
# the first call's argument list is never closed, and the callee names and
# leading arguments of the two calls under the family test (as well as the
# branch separating them) are missing. Restore it from the upstream source;
# the code lines below are kept exactly as found.
#
# Visible behaviour: calls __utils_ind_var_scales_grouping with the source
# table, the independent variable column, the dimension, the schema and the
# grouping column taken from `args`, then branches on whether
# args["family"] is "binomial".
def _compute_data_scales_grouping(args):
__utils_ind_var_scales_grouping(args["tbl_source"], args["col_ind_var"],
args["dimension"], args["schema_madlib"], args["grouping_col"],
if args["family"] == "binomial":
# set mean and std to 0 and 1 respectively, for each group.
args["tbl_source"], args["grouping_col"], args["family"])
# NOTE(review): presumably the non-binomial variant of the call above, which
# additionally passes the schema and both variable columns -- confirm.
args["tbl_source"], args["grouping_col"], args["family"],
args["schema_madlib"], args["col_ind_var"], args["col_dep_var"])
def _compute_data_scales(args):
    """
    Compute the scales of the independent and dependent variables and
    store them in args

    @param args Dict of run parameters; reads "tbl_source", "col_ind_var",
    "col_dep_var", "dimension", "schema_madlib" and "family", and is
    updated in place with:
    - "x_scales": result of __utils_ind_var_scales
    - "y_scale": dict(mean=0, std=1) for the binomial family, otherwise
      the result of __utils_dep_var_scale
    - "xmean_str": args["x_scales"]["mean"] passed through
      _array_to_string
    """
    args["x_scales"] = __utils_ind_var_scales(
        args["tbl_source"], args["col_ind_var"],
        args["dimension"], args["schema_madlib"])

    if args["family"] == "binomial":
        # The dependent variable is left unscaled for the binomial family.
        args["y_scale"] = dict(mean=0, std=1)
    else:
        args["y_scale"] = __utils_dep_var_scale(
            args["schema_madlib"], args["tbl_source"],
            args["col_ind_var"], args["col_dep_var"])

    args["xmean_str"] = _array_to_string(args["x_scales"]["mean"])
# ------------------------------------------------------------------------
# NOTE(review): this block looks damaged by extraction -- indentation and the
# docstring delimiters are gone, and the statements that belonged under the
# grouping test (and the non-grouping alternative the later comment describes)
# are missing; only comments and the skeleton survive. Restore it from the
# upstream source; the code lines below are kept exactly as found.
def _normalize_data(args):
Compute the scaling factors for independent and dependent
variables, and then scale the original data.
The output is stored in tbl_data_scaled
# The dependent variable is flagged for de-centering only for the gaussian
# family.
y_decenter = True if args["family"] == "gaussian" else False
if args["grouping_col"]:
# When grouping_col is defined, we must find an array containing
# the mean of every dimension in the independent variable (x), the
# mean of dependent variable (y) and the standard deviation for them
# specific to groups. Store these results in temp tables x_mean_table
# and y_mean_table.
# __utils_normalize_data_grouping reads the various means and stds
# from the tables.
# When no grouping_col is defined, the mean and std for both 'x' and
# 'y' can be defined using strings, stored in x_mean_str, x_std_str
# etc. We don't need a table like how we needed for grouping.
return None
# ------------------------------------------------------------------------
def _tbl_dimension_rownum(schema_madlib, tbl_source, col_ind_var):
    """
    Measure the dimension and row number of source data table

    @param schema_madlib Schema that provides array_contains_null
    @param tbl_source Name of the source data table
    @param col_ind_var Independent variable array column

    @return Tuple (dimension, row_num)
    """
    # NOTE(review): the .format(...) arguments and the [0][...] result
    # extraction were reconstructed from the placeholders and the result
    # column names in the SQL text -- confirm against the upstream source.

    # independent variable array length
    dimension = plpy.execute("""
        SELECT array_upper({col_ind_var},1) AS dimension
        FROM {tbl_source} LIMIT 1
        """.format(tbl_source=tbl_source,
                   col_ind_var=col_ind_var))[0]["dimension"]

    # total row number of data source table
    # The WHERE clause here ignores rows in the table that contain one or more
    # NULLs in the independent variable (x). There is no NULL check made for
    # the dependent variable (y), since one of the hard assumptions of the
    # input data to elastic_net is that the dependent variable cannot be NULL.
    row_num = plpy.execute("""
        SELECT COUNT(*) FROM {tbl_source}
        WHERE NOT {schema_madlib}.array_contains_null({col_ind_var})
        """.format(tbl_source=tbl_source,
                   schema_madlib=schema_madlib,
                   col_ind_var=col_ind_var))[0]["count"]

    return (dimension, row_num)
# ------------------------------------------------------------------------
def _compute_means(args):
Compute the averages of dependent (y) and independent (x) variables
if args["normalization"]:
xmean_str = _array_to_string([0] * args["dimension"])
ymean = 0
return (xmean_str, ymean)
if args["grouping_col"]:
# We can use the mean of the entire table instead of groups here.
# The absolute correct thing to do is to use group specific
# mean, but we will need to add a new column and change the input
# table contents to do that (it has to be accessed by the group
# iteration controller, C++ code). That is a lot more messier,
# so living with this approximation for now.
# If there is no grouping_col, note that _compute_data_scales() was
# already called, so we don't have to call it again.
return (args["xmean_str"], args["y_scale"]["mean"])
# ------------------------------------------------------------------------