import plpy
import math
import re
from utilities.utilities import _string_to_array
from utilities.utilities import _array_to_string
from convex.utils_regularization import utils_ind_var_scales
from convex.utils_regularization import __utils_dep_var_scale
from convex.utils_regularization import __utils_normalize_data
from convex.utils_regularization import utils_ind_var_scales_grouping
from convex.utils_regularization import __utils_dep_var_scale_grouping
from convex.utils_regularization import __utils_normalize_data_grouping
from utilities.validate_args import table_exists
from collections import namedtuple
# ------------------------------------------------------------------------
# -- constants -----------------------------------------------------------
# The constants below are defined so that they can be used like enums, e.g.:
#     'igd' in OPTIMIZERS (returns True)
#     'igd' == OPTIMIZERS.igd (returns True)
BINOMIAL_FAMILIES = namedtuple("bin", "binomial logistic")('binomial', 'logistic')
GAUSSIAN_FAMILIES = namedtuple("gau", "gaussian linear")('gaussian', 'linear')
OPTIMIZERS = namedtuple("opt", "igd fista")('igd', 'fista')
# -------------------------------------------------------------------------
def _process_results(coef, intercept, outstr_array):
"""
    Return features, features_selected, dense_coef and the full coefficient
    array, each formatted as a string
"""
if not outstr_array:
raise ValueError("Invalid feature name array: {0}".format(str(outstr_array)))
if not coef:
raise ValueError("Invalid coef array: {0}".format(str(coef)))
features = _array_to_string(outstr_array)
selected_array = []
dense_coef = []
for i in range(len(coef)):
if coef[i] != 0:
selected_array.append(outstr_array[i])
dense_coef.append(coef[i])
features_selected = _array_to_string(selected_array)
dense_coef = _array_to_string(dense_coef)
return (features, features_selected, dense_coef, _array_to_string(coef))
# ------------------------------------------------------------------------
def _process_warmup_lambdas(lambdas, lambda_value):
"""
    Convert the warmup_lambdas string into an array of floats
@param lambdas The string which will be converted to an array
@param lambda_value The target value of lambda, which must be equal to
the last element of the input array
"""
matched = re.match(r"^[\[\{\(](.*)[\]\}\)]$", lambdas)
if matched is None:
plpy.error("Elastic Net error: warmup_lambdas must be NULL or something like {3,2,1} !")
elm = _string_to_array(matched.group(1))
elm = [float(i) for i in elm]
    if elm[-1] != lambda_value:
        plpy.error("""
            Elastic Net error: The last element of warmup_lambdas must
            be equal to the lambda value that you want to compute!
            """)
if len(elm) > 1:
for i in range(len(elm) - 1):
if elm[i] <= elm[i + 1]:
plpy.error("""
Elastic Net error: The given warm-up array must be
in a strict descent order.
""")
return elm
# ------------------------------------------------------------------------
def _generate_warmup_lambda_sequence(lambda_value, n_steps):
"""
    Compute the lambda sequence when warmup is True and warmup_lambdas is
    not given
"""
if n_steps == 1:
return [lambda_value]
largest = 1e5
if abs(lambda_value - 0.) < 1e-6:
zero_lambda = True
smallest = 0.001 * largest
n_steps -= 1
else:
smallest = lambda_value
zero_lambda = False
smallest, largest = min(smallest, largest), max(smallest, largest)
    # Guard against a zero division when the zero-lambda branch above has
    # reduced n_steps to 1 (e.g. lambda_value = 0 with n_steps = 2).
    step = math.log(smallest / largest) / (n_steps - 1.) if n_steps > 1 else 0.
constant = math.log(largest)
seq = [math.exp(j * step + constant) for j in range(n_steps)]
if zero_lambda:
seq.append(0.)
return seq
# ------------------------------------------------------------------------
def _compute_average_sq(**args):
"""
    Compute the average squares of all features, used to estimate the largest lambda.
    Only the largest value is actually used, so order does not matter here.
"""
sq = [1] * args["dimension"]
if args["normalization"] is False:
for i in range(args["dimension"]):
sq[i] = (args["x_scales"]["std"][i]) ** 2 + (args["x_scales"]["mean"][i]) ** 2
return sq
# ------------------------------------------------------------------------
def _compute_log_likelihood(coef, intercept, **args):
"""
    Compute the log-likelihood at the end of the computation
"""
if args["family"] == "gaussian": # linear models
loss_query = """
select
{method}(({col_dep_var_new} - {schema_madlib}.elastic_net_gaussian_predict(
'{coefficients}'::double precision[],
{intercept}::double precision,
{col_ind_var_new}))^2)/({denominator})
as loss
from
{tbl_used}
"""
        # See jira 1094: avg suffers from numerical instability, so use
        # sum / count(*) instead when the data is not normalized.
denominator = "2."
method = "avg"
if not args["normalization"]:
method = "sum"
denominator = "count(*) * 2."
loss = plpy.execute(
loss_query.format(coefficients=_array_to_string(coef),
intercept=intercept,
method=method,
denominator=denominator,
**args))[0]["loss"]
elif args["family"] == "binomial": # logistic models
loss = plpy.execute(
"""
select
avg({schema_madlib}.__elastic_net_binomial_loglikelihood(
'{coefficients}'::double precision[],
{intercept},
{col_dep_var_new},
{col_ind_var_new}))
as loss
from {tbl_used}
""".format(coefficients=_array_to_string(coef),
intercept=intercept,
**args))[0]["loss"]
    module_1 = sum(x * x for x in coef) / 2.  # ridge (L2) penalty term
    module_2 = sum(abs(x) for x in coef)      # lasso (L1) penalty term
log_likelihood = - (loss + args["lambda_value"] *
((1 - args["alpha"]) * module_1 + args["alpha"] * module_2))
return log_likelihood
# ------------------------------------------------------------------------
def _elastic_net_validate_args(tbl_source, col_ind_var, col_dep_var,
tbl_result, tbl_summary, lambda_value, alpha,
normalization, max_iter, tolerance):
if (any(i is None for i in (lambda_value, alpha, normalization)) or
any(not i for i in (tbl_source, col_ind_var, col_dep_var, tbl_result))):
plpy.error("Elastic Net error: You have unsupported NULL/empty value(s) in the arguments!")
if table_exists(tbl_result, only_first_schema=True):
plpy.error("Elastic Net error: Output table " + tbl_result + " already exists!")
if table_exists(tbl_summary, only_first_schema=True):
plpy.error("Elastic Net error: Output summary table " + tbl_summary + " already exists!")
if lambda_value < 0:
plpy.error("Elastic Net error: The regularization parameter lambda cannot be negative!")
if alpha < 0 or alpha > 1:
plpy.error("Elastic Net error: The elastic net control parameter alpha must be in [0,1] !")
if max_iter <= 0:
plpy.error("Elastic Net error: max_iter must be positive!")
if tolerance < 0:
plpy.error("Elastic Net error: tolerance must be positive!")
return None
# ------------------------------------------------------------------------
def _compute_scales(args):
if args["grouping_col"]:
_compute_data_scales_grouping(args)
else:
_compute_data_scales(args)
def _compute_data_scales_grouping(args):
# When grouping_col is defined, we must find an array containing
# the mean of every dimension in the independent variable (x), the
# mean of dependent variable (y) and the standard deviation for them
# specific to groups. Store these results in temp tables x_mean_table
# and y_mean_table.
utils_ind_var_scales_grouping(args["tbl_source"], args["col_ind_var"],
args["dimension"], args["schema_madlib"], args["grouping_col"],
args["x_mean_table"])
if args["family"] == "binomial":
# set mean and std to 0 and 1 respectively, for each group.
__utils_dep_var_scale_grouping(args["y_mean_table"],
args["tbl_source"], args["grouping_col"], args["family"])
else:
__utils_dep_var_scale_grouping(args["y_mean_table"],
args["tbl_source"], args["grouping_col"], args["family"],
args["schema_madlib"], args["col_ind_var"], args["col_dep_var"])
def _compute_data_scales(args):
args["x_scales"] = utils_ind_var_scales(args["tbl_source"],
args["col_ind_var"], args["dimension"], args["schema_madlib"])
if args["family"] == "binomial":
args["y_scale"] = dict(mean=0, std=1)
else:
args["y_scale"] = __utils_dep_var_scale(args["schema_madlib"],
args["tbl_source"], args["col_ind_var"], args["col_dep_var"])
args["xmean_str"] = _array_to_string(args["x_scales"]["mean"])
# ------------------------------------------------------------------------
def _normalize_data(args):
"""
Compute the scaling factors for independent and dependent
variables, and then scale the original data.
The output is stored in tbl_data_scaled
"""
    y_decenter = (args["family"] == "gaussian")
_compute_scales(args)
if args["grouping_col"]:
# When grouping_col is defined, we must find an array containing
# the mean of every dimension in the independent variable (x), the
# mean of dependent variable (y) and the standard deviation for them
# specific to groups. Store these results in temp tables x_mean_table
# and y_mean_table.
# __utils_normalize_data_grouping reads the various means and stds
# from the tables.
__utils_normalize_data_grouping(y_decenter=y_decenter,
tbl_data=args["tbl_source"],
col_ind_var=args["col_ind_var"],
col_dep_var=args["col_dep_var"],
tbl_data_scaled=args["tbl_data_scaled"],
col_ind_var_norm_new=args["col_ind_var_norm_new"],
col_dep_var_norm_new=args["col_dep_var_norm_new"],
schema_madlib=args["schema_madlib"],
x_mean_table=args["x_mean_table"],
y_mean_table=args["y_mean_table"],
grouping_col=args["grouping_col"])
else:
        # When no grouping_col is defined, the mean and std for both 'x' and
        # 'y' can be passed as plain strings (x_mean_str, x_std_str, etc.);
        # no temp tables are needed as in the grouping case.
__utils_normalize_data(y_decenter=y_decenter,
tbl_data=args["tbl_source"],
col_ind_var=args["col_ind_var"],
col_dep_var=args["col_dep_var"],
tbl_data_scaled=args["tbl_data_scaled"],
col_ind_var_norm_new=args["col_ind_var_norm_new"],
col_dep_var_norm_new=args["col_dep_var_norm_new"],
schema_madlib=args["schema_madlib"],
x_mean_str=args["xmean_str"],
x_std_str=_array_to_string(args["x_scales"]["std"]),
y_mean=args["y_scale"]["mean"],
y_std=args["y_scale"]["std"],
grouping_col=args["grouping_col"])
return None
# ------------------------------------------------------------------------
def _compute_means(args):
"""
Compute the averages of dependent (y) and independent (x) variables
"""
if args["normalization"]:
xmean_str = _array_to_string([0] * args["dimension"])
ymean = 0
return (xmean_str, ymean)
if args["grouping_col"]:
        # We can use the mean of the entire table here instead of per-group
        # means. The strictly correct thing to do is to use group-specific
        # means, but that would require adding a new column and changing the
        # input table contents (which have to be accessed by the group
        # iteration controller, C++ code). That is a lot messier, so we live
        # with this approximation for now.
_compute_data_scales(args)
    # If there is no grouping_col, _compute_data_scales() has already been
    # called, so there is no need to call it again.
return (args["xmean_str"], args["y_scale"]["mean"])
# ------------------------------------------------------------------------