blob: fb46ba29403cd6e77d27fc8ce7e36a46ed628d23 [file] [log] [blame]
import plpy
from elastic_net_models import _elastic_net_gaussian_igd_train
from elastic_net_models import _elastic_net_gaussian_fista_train
from elastic_net_models import _elastic_net_binomial_fista_train
from elastic_net_models import _elastic_net_binomial_igd_train
from elastic_net_utils import _generate_warmup_lambda_sequence
from elastic_net_utils import BINOMIAL_FAMILIES, GAUSSIAN_FAMILIES, OPTIMIZERS
from utilities.validate_args import is_col_array
from utilities.validate_args import table_exists
from utilities.validate_args import table_is_empty
from utilities.validate_args import columns_exist_in_table
from utilities.validate_args import get_cols_and_types
from utilities.validate_args import cols_in_tbl_valid
from utilities.validate_args import explicit_bool_to_text
from utilities.utilities import extract_keyvalue_params
from utilities.control import MinWarning
from utilities.utilities import _string_to_array_with_quotes
from utilities.utilities import _string_to_array
from utilities.utilities import is_psql_numeric_type
from utilities.utilities import _assert
from utilities.utilities import add_postfix
import re
from validation.internal.cross_validation import CrossValidator
# ------------------------------------------------------------------------
def elastic_net_help(schema_madlib, family_or_optimizer=None, **kwargs):
"""
Given a response family name or optimizer name, print out the related
information.
If a family name is given, print out the supported optimizer together
with its default optimizer.
If an optimizer name is given, print out the necessary parameters.
"""
if (family_or_optimizer is None or
family_or_optimizer.lower() in ("help", "?")):
return """
----------------------------------------------------------------
Summary
----------------------------------------------------------------
Right now, gaussian (linear) and binomial (logistic) families
are supported!
--
Run:
SELECT {schema_madlib}.elastic_net_train('gaussian');
or
SELECT {schema_madlib}.elastic_net_train('binomial');
to see more help.
--
Run: SELECT {schema_madlib}.elastic_net_train('usage');
to see how to use.
--
Run: SELECT {schema_madlib}.elastic_net_train('predict');
to see how to predict.
--
Run: SELECT {schema_madlib}.elastic_net_train('example');
to see some examples.
""".format(schema_madlib=schema_madlib)
if (family_or_optimizer.lower() in ('example', 'examples')):
return """
----------------------------------------------------------------
EXAMPLE
----------------------------------------------------------------
Create an input data set:
DROP TABLE IF EXISTS houses;
CREATE TABLE houses ( id INT,
tax INT,
bedroom INT,
bath FLOAT,
price INT,
size INT,
lot INT,
zipcode INT);
INSERT INTO houses VALUES
(1, 590, 2, 1, 50000, 770, 22100, 94301),
(2, 1050, 3, 2, 85000, 1410, 12000, 94301),
(3, 20, 3, 1, 22500, 1060, 3500, 94301),
(4, 870, 2, 2, 90000, 1300, 17500, 94301),
(5, 1320, 3, 2, 133000, 1500, 30000, 94301),
(6, 1350, 2, 1, 90500, 820, 25700, 94301),
(7, 2790, 3, 2.5, 260000, 2130, 25000, 94301),
(8, 680, 2, 1, 142500, 1170, 22000, 94301),
(9, 1840, 3, 2, 160000, 1500, 19000, 94301),
(10, 3680, 4, 2, 240000, 2790, 20000, 94301),
(11, 1660, 3, 1, 87000, 1030, 17500, 94301),
(12, 1620, 3, 2, 118600, 1250, 20000, 94301),
(13, 3100, 3, 2, 140000, 1760, 38000, 94301),
(14, 2070, 2, 3, 148000, 1550, 14000, 94301),
(15, 650, 3, 1.5, 65000, 1450, 12000, 94301),
(16, 770, 2, 2, 91000, 1300, 17500, 76010),
(17, 1220, 3, 2, 132300, 1500, 30000, 76010),
(18, 1150, 2, 1, 91100, 820, 25700, 76010),
(19, 2690, 3, 2.5, 260011, 2130, 25000, 76010),
(20, 780, 2, 1, 141800, 1170, 22000, 76010),
(21, 1910, 3, 2, 160900, 1500, 19000, 76010),
(22, 3600, 4, 2, 239000, 2790, 20000, 76010),
(23, 1600, 3, 1, 81010, 1030, 17500, 76010),
(24, 1590, 3, 2, 117910, 1250, 20000, 76010),
(25, 3200, 3, 2, 141100, 1760, 38000, 76010),
(26, 2270, 2, 3, 148011, 1550, 14000, 76010),
(27, 750, 3, 1.5, 66000, 1450, 12000, 76010);
Train a model:
DROP TABLE IF EXISTS houses_en, houses_en_summary;
SELECT {schema_madlib}.elastic_net_train(
'houses', -- source table
'houses_en', -- result table
'price', -- dependent variable
'array[tax, bath, size]', -- independent variable
'gaussian', -- regression family
0.5, -- alpha value
0.1, -- lambda value
TRUE, -- standardize
NULL, -- grouping column(s)
'fista', -- optimizer
'', -- optimizer parameters
NULL, -- excluded columns
10000, -- maximum iterations
1e-6 -- tolerance value
);
View the resulting model:
\\x on
SELECT * FROM houses_en;
\\x off
Use the prediction function to evaluate residuals:
SELECT id, price, predict, price - predict AS residual
FROM (
SELECT
houses.*,
{schema_madlib}.elastic_net_gaussian_predict(
m.coef_all,
m.intercept,
ARRAY[tax,bath,size]
) AS predict
FROM houses, houses_en m) s
ORDER BY id;
Additional Example (with grouping):
DROP TABLE IF EXISTS houses_en1, houses_en1_summary;
SELECT {schema_madlib}.elastic_net_train( 'houses',
'houses_en1',
'price',
'array[tax, bath, size]',
'gaussian',
1,
30000,
TRUE,
'zipcode',
'fista',
'',
NULL,
10000,
1e-6
);
View the resulting model and see a separate model for each group:
\\x on
SELECT * FROM houses_en1;
\\x off
Use the prediction function to evaluate residuals:
SELECT {schema_madlib}.elastic_net_predict(
'houses_en1', -- model table
'houses', -- new source data table
'id', -- unique ID associated with each row
'houses_en1_prediction' -- table to store prediction result
);
View the results:
SELECT houses.id,
houses.price,
houses_en1_prediction.prediction,
houses.price - houses_en1_prediction.prediction AS residual
FROM houses_en1_prediction, houses
WHERE houses.id=houses_en1_prediction.id;
"""
if (family_or_optimizer.lower() in ('usage', 'help', '?')):
return """
----------------------------------------------------------------
USAGE
----------------------------------------------------------------
SELECT {schema_madlib}.elastic_net_train (
'tbl_source', -- Data table
'tbl_result', -- Result table
'col_dep_var', -- Dependent variable, can be an expression or
'*'
'col_ind_var', -- Independent variable, can be an expression
'regress_family', -- 'gaussian' (or 'linear'). 'binomial'
(or 'logistic')
alpha, -- Elastic net controlparameter, value in [0, 1]
lambda_value, -- Regularization parameter, positive
standardize, --Whether to normalize the ata
'grouping_col', -- Group by which columns. (DEFAULT: NULL)
'optimizer', --Name of optimizer. (DEFAUT: 'fista')
'optimizer_params',-- Comma-separated string of optimizer parameters
'excluded', -- Column names excluded frm '*' (DEFAULT = NULL)
max_iter, -- Maximum iteration numbr (DEFAULT = 1000)
tolerance -- Stopping criteria (DEFAULT = 1e-4)
);
----------------------------------------------------------------
OUTPUT
----------------------------------------------------------------
The output table ('tbl_result' above) has the following columns:
grouping_col TEXT, --'Distinct values of groupng_col'
family TEXT, --'gaussian' or 'binomial'
features TEXT[], -- All feature column names
features_selected TEXT[], -- Features with non-zero coefficients
coef_nonzero DOUBLE PRECISION[], -- Non-zero coefficients
coef_all DOUBLE PRECISION[], -- All coefficients
intercept DOUBLE PRECISION, -- Intercept of the linear fit
log_likelihood DOUBLE PRECISION, -- log-likelihood of the fit
standardize BOOLEAN, -- Whether the data was standardized
-- before fitting
iteration_run INTEGER -- How many iteration was actually run
If the independent variable is a column with type of array, features
and features_selected will output indices of the array.
""".format(schema_madlib=schema_madlib)
if family_or_optimizer.lower() == "predict":
return """
----------------------------------------------------------------
Prediction
----------------------------------------------------------------
SELECT {schema_madlib}.elastic_net_predict(
'regress_family', -- 'gaussian' (or 'linear'). 'binomial'
-- (or 'logistic') will be supported
coefficients, -- Fitting coefficients as a double array
intercept,
ind_var -- independent variables
) FROM tbl_result, tbl_new_source;
When predicting with binomial models, the return value is 1
if the predicted result is True, and 0 if the prediction is
False.
OR -------------------------------------------------------------
(1) SELECT {schema_madlib}.elastic_net_gaussian_predict (
coefficients, intercept, ind_var
) FROM tbl_result, tbl_new_source;
(2) SELECT {schema_madlib}.elastic_net_binomial_predict (
coefficients, intercept, ind_var
) FROM tbl_result, tbl_new_source;
(3) SELECT {schema_madlib}.elastic_net_binomial_prob (
coefficients, intercept, ind_var
) FROM tbl_result, tbl_new_source;
This returns probability values for the class being 'True'.
OR -------------------------------------------------------------
SELECT {schema_madlib}.elastic_net_predict(
'tbl_model', -- Result table of elastic_net_train
'tbl_new_source', -- New data source
'col_id', -- Unique ID column
'tbl_predict' -- Prediction result
);
will put all prediction results into a table. This can be
used together with cross_validation_general() function.
When predicting with binomial models, the predicted values
are BOOLEAN.
""".format(schema_madlib=schema_madlib)
if (family_or_optimizer.lower() in ("gaussian", "linear")):
return """
----------------------------------------------------------------
Fitting linear models
----------------------------------------------------------------
Supported optimizer:
(1) Incremental gradient descent method ('igd')
(2) Fast iterative shrinkage thesholding algorithm ('fista')
Default is 'fista'
--
Run:
SELECT {schema_madlib}.elastic_net_train('optimizer');
to see more help on each optimizer.
""".format(schema_madlib=schema_madlib)
if (family_or_optimizer.lower() in ("binomial", "logistic")):
return """
----------------------------------------------------------------
Fitting logistic models
----------------------------------------------------------------
The dependent variable must be a BOOLEAN.
Supported optimizer:
(1) Incremental gradient descent method ('igd')
(2) Fast iterative shrinkage thesholding algorithm ('fista')
Default is 'fista'
--
Run:
SELECT {schema_madlib}.elastic_net_train('optimizer');
to see more help on each optimizer.
""".format(schema_madlib=schema_madlib)
if family_or_optimizer.lower() == "igd":
return """
----------------------------------------------------------------
Incremental gradient descent (IGD) method
----------------------------------------------------------------
Right now, it supports fitting both linear and logistic models.
In order to obtain sparse coefficients, a
modified version of IGD is actually used.
Parameters --------------------------------
stepsize - default is 0.01
threshold - default is 1e-10. When a coefficient is really
small, set it to be 0
warmup - default is False
warmup_lambdas - default is Null
warmup_lambda_no - default is 15. How many lambda's are used in
warm-up, will be overridden if warmup_lambdas
is not NULL
warmup_tolerance - default is the same as tolerance. The value
of tolerance used during warmup.
n_folds - default is 1, Number of cross validation folds.
Set this to greater than 1 if CV over lambda is required.
validation_result - Name of the table to store the cross validation results.
parallel - default is True. Run the computation on
multiple segments or not.
When warmup is True or if n_folds > 1, and warmup_lambdas is NULL, a series
of lambda values will be automatically generated and used.
Reference --------------------------------
[1] Shai Shalev-Shwartz and Ambuj Tewari, Stochastic Methods for l1
Regularized Loss Minimization. Proceedings of the 26th Interna-
tional Conference on Machine Learning, Montreal, Canada, 2009.
"""
if family_or_optimizer.lower() == "fista":
return """
----------------------------------------------------------------
Fast iterative shrinkage thesholding algorithm
with backtracking for stepsizes
----------------------------------------------------------------
Right now, it supports fitting both linear and logistic models.
Parameters --------------------------------
max_stepsize - default is 4.0
eta - default is 1.2, if stepsize does not work
stepsize/eta will be tried
warmup - default is False
warmup_lambdas - default is NULL, which means that lambda
values will be automatically generated
warmup_lambda_no - default is 15. How many lambda's are used in
warm-up, will be overridden if warmup_lambdas
is not NULL
warmup_tolerance - default is the same as tolerance. The value
of tolerance used during warmup.
use_active_set - default is False. Sometimes active-set method
can speed up the calculation.
activeset_tolerance - default is the same as tolerance. The
value of tolerance used during active set
calculation
random_stepsize - default is False. Whether add some randomness
to the step size. Sometimes, this can speed
up the calculation.
n_folds - default is 1. Number of cross validation folds.
Set this to greater than 1 if CV over lambda is required.
validation_result - Name of the table to store the cross validation results.
When warmup is True and warmup_lambdas is NULL, warmup_lambda_no
of lambda values will be automatically generated and used.
Reference --------------------------------
[1] Beck, A. and M. Teboulle (2009), A fast iterative
shrinkage-thresholding algorothm for linear inverse
problems. SIAM J. on Imaging Sciences 2(1), 183-202.
"""
# if family_or_optimizer.lower() == "newton":
# return "Newton method "
return """
Elastic Net error: Not a supported response family or optimizer
Run:
SELECT {schema_madlib}.elastic_net_train();
for help
""".format(schema_madlib=schema_madlib)
# ------------------------------------------------------------------------
def elastic_net_train(schema_madlib, source_table, model_table, dependent_varname,
independent_varname, regress_family, alpha, lambda_value,
standardize, grouping_col, optimizer,
optimizer_params, excluded, max_iter, tolerance,
**kwargs):
"""
A wrapper for all variants of elastic net regularization.
@param source_table Name of data source table
@param independent_varname Name of independent variable column,
independent variable is an array
@param dependent_varname Name of dependent variable column
@param model_table Name of the table to store the results,
will return fitting coefficients and
likelihood
@param lambda_value The regularization parameter
@param alpha The elastic net parameter, [0, 1]
@param standardize Whether to normalize the variables
@param regress_family Response type, 'gaussian' or 'binomial'
@param optimizer The optimization algorithm, for example 'igd'
@param optimizer_params Parameters of the above optimizer, the format
is '{arg = value, ...}'::varchar[]
@param excluded Which variables are excluded when
independent_varname == "*"
"""
with MinWarning("warning"):
if regress_family is None:
plpy.error("""
Elastic Net error: Please enter a valid response family name!
Run:
SELECT {schema_madlib}.elastic_net_train();
for supported response family.
""".format(schema_madlib=schema_madlib))
if optimizer is None:
plpy.error("""
Elastic Net error: Please enter a valid optimizer name!
Run:
SELECT {schema_madlib}.elastic_net_train('gaussian');
for supported optimizers.
""".format(schema_madlib=schema_madlib))
regress_family = regress_family.lower()
optimizer = optimizer.lower()
if (regress_family not in (BINOMIAL_FAMILIES + GAUSSIAN_FAMILIES) or
optimizer not in OPTIMIZERS):
plpy.error("""
Elastic Net error: Not a supported response family or supported
optimizer of the given response family!
Run:
SELECT {schema_madlib}.elastic_net_train();
for help.
""".format(schema_madlib=schema_madlib))
cv_param, optimizer_params = _get_cv_optimizer_params(
optimizer_params, alpha, lambda_value)
args = locals()
if cv_param['n_folds'] > 1:
args.update(cv_param)
_cross_validate_en(args)
_internal_elastic_net_train(**args)
# ------------------------------------------------------------------------
def _internal_elastic_net_train(
schema_madlib, source_table, model_table, dependent_varname,
independent_varname, grouping_col,
regress_family, alpha, lambda_value,
standardize, optimizer, optimizer_params, excluded,
max_iter, tolerance, **kwargs):
tbl_summary = add_postfix(model_table, "_summary")
# handle all special cases of independent_varname
independent_varname, outstr_array = analyze_input_str(
schema_madlib, source_table,
independent_varname, dependent_varname, excluded)
# get the grouping info
grouping_str, grouping_col = _get_grouping_col_str(schema_madlib,
source_table, grouping_col)
if regress_family in GAUSSIAN_FAMILIES:
if optimizer == OPTIMIZERS.igd:
_elastic_net_gaussian_igd_train(
schema_madlib, source_table, independent_varname, dependent_varname,
model_table, tbl_summary, lambda_value, alpha, standardize,
optimizer_params, max_iter, tolerance, outstr_array,
grouping_str, grouping_col, **kwargs)
return None
if optimizer == OPTIMIZERS.fista:
_elastic_net_gaussian_fista_train(
schema_madlib, source_table, independent_varname, dependent_varname,
model_table, tbl_summary, lambda_value, alpha, standardize,
optimizer_params, max_iter, tolerance, outstr_array,
grouping_str, grouping_col, **kwargs)
return None
elif regress_family in BINOMIAL_FAMILIES:
if optimizer == OPTIMIZERS.igd:
dependent_varname = "(" + dependent_varname + ")::boolean"
_elastic_net_binomial_igd_train(
schema_madlib, source_table, independent_varname, dependent_varname,
model_table, tbl_summary, lambda_value, alpha, standardize,
optimizer_params, max_iter, tolerance, outstr_array,
grouping_str, grouping_col, **kwargs)
return None
if optimizer == OPTIMIZERS.fista:
dependent_varname = "(" + dependent_varname + ")::boolean"
_elastic_net_binomial_fista_train(
schema_madlib, source_table, independent_varname, dependent_varname,
model_table, tbl_summary, lambda_value, alpha, standardize,
optimizer_params, max_iter, tolerance, outstr_array,
grouping_str, grouping_col, **kwargs)
return None
return None
# ----------------------------------------------------------------------
def _get_cv_optimizer_params(param_str, alpha, smallest_lambda):
cv_params_defaults = {
"n_folds": (1, int),
"lambda_value": (None, list),
"alpha": ([alpha], list),
"n_lambdas": (15, int),
"validation_result": (None, str)
}
param_defaults = dict([(k, v[0]) for k, v in cv_params_defaults.items()])
param_types = dict([(k, v[1]) for k, v in cv_params_defaults.items()])
if not param_str:
return param_defaults, param_str
name_value = extract_keyvalue_params(param_str, param_types, param_defaults,
ignore_invalid=True)
if name_value['n_folds'] > 1:
if not name_value['lambda_value']:
if name_value['n_lambdas']:
name_value['lambda_value'] = _generate_warmup_lambda_sequence(
smallest_lambda, name_value['n_lambdas'])
# no warmup when cross validating on lambda
param_str += ', warmup=False'
else:
name_value['lambda_value'] = [float(smallest_lambda)]
else:
name_value['lambda_value'] = map(float, name_value['lambda_value'])
# no warmup when cross validating on lambda
param_str += ', warmup=False'
name_value['alpha'] = map(float, name_value['alpha'])
return name_value, param_str
# ------------------------------------------------------------------------
def _cross_validate_en(args):
# updating params_dict will also update args['params_dict']
if args['n_folds'] > 1 and args['grouping_col']:
plpy.error('Elastic Net Error: cross validation with grouping is not supported!')
allowed_cv_params = ('lambda_value', 'alpha') # keep trailing comma for single element
cv_params_values = {}
for param in allowed_cv_params:
if isinstance(args[param], list):
if len(args[param]) > 1:
cv_params_values[param] = args[param]
else:
args[param] = args[param][0]
if not cv_params_values and args['n_folds'] <= 1:
# no cross validation
return
if not cv_params_values and args['n_folds'] > 1:
plpy.warning('Elastic Net Warning: n_folds > 1 but no '
'cross validation parameter provided')
return
if cv_params_values and args['n_folds'] <= 1:
plpy.error('Elastic Net Error: All parameters must be scalar '
'when n_folds is 0 or 1')
scorer = 'classification' if args['regress_family'] in BINOMIAL_FAMILIES else 'regression'
cv = CrossValidator(_internal_elastic_net_train, elastic_net_predict_all, scorer, args)
val_res = cv.validate(cv_params_values, args['n_folds'])
if 'validation_result' in args:
val_res.output_tbl(args['validation_result'])
args.update(val_res.top('sub_args'))
# ------------------------------------------------------------------------------
def _get_grouping_col_str(schema_madlib, source_table, grouping_col):
if grouping_col and grouping_col.lower() != 'null':
cols_in_tbl_valid(source_table,
_string_to_array_with_quotes(grouping_col),
'elastic_net')
intersect = frozenset(
_string_to_array(grouping_col)).intersection(
frozenset(
('regress_family', 'coef_all',
'features_selected',
'coef_nonzero', 'intercept',
'log_likelihood', 'standardize',
'iteration_run')))
_assert(len(intersect) == 0,
"Elastic Net error: Conflicting grouping column name.\n"
"Some predefined keyword(s) ({0}) are not allowed "
"for grouping column names!".format(', '.join(intersect)))
grouping_list = [i + "::text"
for i in explicit_bool_to_text(
source_table,
_string_to_array_with_quotes(grouping_col),
schema_madlib)]
grouping_str = ','.join(grouping_list)
else:
grouping_str = "Null"
grouping_col = None
return grouping_str, grouping_col
# ------------------------------------------------------------------------------
def _check_args(tbl_source, col_ind_var, col_dep_var):
"""
Check arguments before analyze_input_str
"""
if any(each_arg is None for each_arg in (tbl_source, col_ind_var, col_dep_var)):
plpy.error("Elastic Net error: You have unsupported NULL value(s) in the arguments!")
if any(each_arg.strip() == '' for each_arg in (tbl_source, col_ind_var, col_dep_var)):
plpy.error("Elastic Net error: You have unsupported EMPTY value(s) in the arguments!")
if not table_exists(tbl_source):
plpy.error("Elastic Net error: Data table " + tbl_source + " does not exist!")
if table_is_empty(tbl_source):
plpy.error("Elastic Net error: Data table " + tbl_source + " is empty!")
# ------------------------------------------------------------------------
def analyze_input_str(schema_madlib, tbl_source,
col_ind_var, col_dep_var, excluded):
"""
Make input strings and output strings compatible with functions
@param tbl_source Data table
@param col_ind_var Independent variables
@param col_dep_var Dependent variables
@param excluded Which variables are excluded when col_ind_var == "*"
"""
_check_args(tbl_source, col_ind_var, col_dep_var)
outstr_array = []
if col_ind_var == "*":
col_types_dict = dict(get_cols_and_types(tbl_source))
cols = col_types_dict.keys()
s = _string_to_array(excluded) if excluded is not None else []
for each_col in cols:
if each_col not in s and each_col != col_dep_var:
outstr_array.append(each_col)
if not outstr_array:
plpy.error("Elastic Net error: All columns from independent variables "
"have been excluded")
elif (len(outstr_array) == 1 and
col_types_dict[outstr_array[0]].lower() == 'array'):
col_ind_var = outstr_array[0]
return analyze_single_input_str(schema_madlib, tbl_source,
col_ind_var)
else:
included_col_types = [col_types_dict[i] for i in outstr_array]
if not all(is_psql_numeric_type(i)
for i in included_col_types):
plpy.error("Elastic Net error: All columns to be included in the "
"independent variables should be of the numeric type.")
col_ind_var_new = "ARRAY[" + ','.join(outstr_array) + "]"
return (col_ind_var_new, outstr_array)
if columns_exist_in_table(tbl_source, [col_ind_var], schema_madlib):
# if the input is a column name and not an expression
return analyze_single_input_str(schema_madlib, tbl_source,
col_ind_var, excluded)
else:
# if input is an expression resulting in an array output
matched = re.match(r"(?i)^array\[(.*)\]", col_ind_var)
if matched:
# array expression starts with the word "ARRAY"
outstr_array = _string_to_array(matched.group(1))
else:
# any other form of array expression
n_feat = plpy.execute(""" SELECT array_upper({indep_var}, 1) as num_feat
FROM {source} LIMIT 1
""".format(indep_var=col_ind_var,
source=tbl_source))[0]["num_feat"]
outstr_array = ["[" + str(i) + "]" for i in range(1, n_feat + 1)]
# We allow expressions for independent variables that could start with
# something other than 'array'
# Example use case: input independent variable of array column
# adding an intercept could be done as '1 || x' where 'x' is array
# of independent variables.
# plpy.error("Elastic Net error: Independent variable format is not quite right!")
return (col_ind_var, outstr_array)
# ------------------------------------------------------------------------
def analyze_single_input_str(schema_madlib, tbl_source, col_ind_var,
excluded=None):
"""
Args:
@param schema_madlib: string, Name of schema where MADlib is installed
@param tbl_source: string, Name of input table
@param col_ind_var: string, Name of independent variable
(must be single column name and of ARRAY type )
@param excluded: list, Indices of elements to exclude.
Returns:
"""
if columns_exist_in_table(tbl_source, [col_ind_var], schema_madlib):
# a single column is independent variable
# which means that it is an array
# excluded must be a string containing integers
if not is_col_array(tbl_source, col_ind_var):
plpy.error("Elastic Net error: The independent variable must be an array!")
dimension = plpy.execute(
"""
SELECT array_upper({col_ind_var}, 1) AS dimension
FROM {tbl_source} limit 1
""".format(tbl_source=tbl_source,
col_ind_var=col_ind_var))[0]["dimension"]
if excluded is not None:
s = _string_to_array(excluded)
invalid_excluded = """
Elastic Net error: When the independent variable is
an array column, excluded values can only be indices
(i.e. between 1 and {0})""".format(dimension)
try:
s = [int(i) for i in s]
except:
plpy.error(invalid_excluded)
if any(i < 1 or i > dimension for i in s):
plpy.error(invalid_excluded)
else:
s = []
outstr_array = ["%s[%s]" % (col_ind_var, str(i))
for i in range(1, dimension + 1) if i not in s]
if s:
col_ind_var_new = "ARRAY[" + ",".join(outstr_array) + "]"
else:
col_ind_var_new = col_ind_var
return (col_ind_var_new, outstr_array)
else:
plpy.error("Elastic Net error: Single column name included for "
"independent variable is not found in source table.")
# ------------------------------------------------------------------------
def elastic_net_predict_all(schema_madlib, tbl_model, tbl_new_source,
col_id, tbl_predict, **kwargs):
"""
Predict and put the result in a table. Useful for general CV
"""
summary_table = add_postfix(tbl_model, "_summary")
grouping_col = plpy.execute("SELECT grouping_col FROM {summary_table}".
format(summary_table=summary_table))[0]["grouping_col"]
with MinWarning("error"):
regress_family = plpy.execute("SELECT family FROM {tbl_model} ".
format(tbl_model=tbl_model))[0]["family"]
if regress_family.lower() in ("gaussian", "linear"):
predict_func = "elastic_net_gaussian_predict"
elif regress_family.lower() in ("binomial", "logistic"):
predict_func = "elastic_net_binomial_predict"
else:
plpy.error("Elastic Net error: Not a supported response family!")
if col_id is None or col_id == '':
plpy.error("Elastic Net error: invalid ID column provided!")
if columns_exist_in_table(tbl_new_source, [col_id], schema_madlib):
elastic_net_predict_id = col_id
else:
elastic_net_predict_id = 'elastic_net_predict_id'
dense_vars = plpy.execute(""" SELECT features AS fs FROM {tbl_model}
""".format(tbl_model=tbl_model))[0]["fs"]
dense_vars_str = "ARRAY[" + ", ".join(dense_vars) + "]"
# Must be careful to avoid possible name conflicts
if grouping_col and grouping_col != 'NULL':
qstr = """
CREATE TABLE {tbl_predict} AS
SELECT
{elastic_net_predict_id},
{schema_madlib}.{predict_func}(coef_all, intercept, ind_var)
AS prediction
FROM
{tbl_model} as tbl1
JOIN
(SELECT
{grouping_col},
{col_id} as {elastic_net_predict_id},
{dense_vars_str} as ind_var
FROM
{tbl_new_source}) tbl2
USING ({grouping_col})
ORDER BY {grouping_col}, {elastic_net_predict_id}
""".format(**locals())
else:
qstr = """
CREATE TABLE {tbl_predict} AS
SELECT
{elastic_net_predict_id},
{schema_madlib}.{predict_func}(coef_all, intercept, ind_var)
AS prediction
FROM
{tbl_model} as tbl1,
(
SELECT
{col_id} as {elastic_net_predict_id},
{dense_vars_str} as ind_var
FROM
{tbl_new_source}
) tbl2
""".format(**locals())
plpy.execute(qstr)
return None
# ------------------------------------------------------------------------