blob: 92cd280d08b0b3c4d60f323c14ab3da511820e20 [file] [log] [blame]
# coding=utf-8
"""
@file logistic.py_in
@brief Logistic Regression: Driver functions
@namespace logistic
@brief Logistic Regression: Driver functions
"""
import plpy
from utilities.group_control import GroupIterationController
from utilities.utilities import unique_string
from utilities.validate_args import table_exists
from utilities.validate_args import columns_exist_in_table
from utilities.validate_args import table_is_empty
from utilities.validate_args import explicit_bool_to_text
from utilities.utilities import _string_to_array
from utilities.utilities import __mad_version
from utilities.utilities import add_postfix
from utilities.utilities import _string_to_array_with_quotes
from collections import defaultdict
# NOTE(review): presumably provides platform/version-specific helpers from
# utilities.utilities (__mad_version); not referenced in this chunk — confirm
# usage elsewhere in the module before removing.
version_wrapper = __mad_version()
# ========================================================================
def __compute_logregr(schema_madlib, rel_args, rel_state, rel_source,
                      dep_col, ind_col, optimizer, grouping_col,
                      grouping_str, col_grp_iteration, col_grp_state, **kwargs):
    """
    Compute logistic regression coefficients.

    Serves as an interface to the different optimization algorithms.
    By default, iteratively reweighted least squares is used, but for data
    with many columns the conjugate-gradient method might perform better.

    @param schema_madlib Name of the MADlib schema, properly escaped/quoted
    @param rel_args Relation storing optimizer parameters (max_iter, tolerance)
    @param rel_state Relation storing the per-iteration states
    @param rel_source Name of relation containing the training data
    @param dep_col Name of dependent column in training data (of type BOOLEAN)
    @param ind_col Name of independent column in training data (of type
                   DOUBLE PRECISION[])
    @param optimizer Name of the optimizer. 'newton' or 'irls': iteratively
                     reweighted least squares, 'cg': conjugate gradient,
                     'igd': incremental gradient descent
    @param grouping_col String of comma-delimited group-by columns
    @param grouping_str String of comma-delimited group-by columns (casted)
    @param col_grp_iteration Column name holding the iteration number
    @param col_grp_state Column name holding the iteration state
    @param kwargs Additional arguments are accepted (and ignored) so the
                  caller can unpack a dictionary whose element set is a
                  superset of the arguments required by this function.
    @return Number of iterations that have been run
    """
    controller = GroupIterationController(
        rel_args=rel_args,
        rel_state=rel_state,
        stateType="double precision[]",
        schema_madlib=schema_madlib,  # Identifiers start here
        rel_source=rel_source,
        ind_col=ind_col,
        dep_col=dep_col,
        optimizer=optimizer,
        grouping_col=grouping_col,
        grouping_str=grouping_str,
        col_grp_iteration=col_grp_iteration,
        col_grp_state=col_grp_state)
    with controller as ctrl:
        ctrl.iteration = 0
        keep_going = True
        while keep_going:
            # One optimizer step per iteration; the SQL template is expanded
            # by the iteration controller.
            ctrl.update(
                """
                {schema_madlib}.__logregr_{optimizer}_step(
                    ({dep_col})::boolean,
                    ({ind_col})::double precision[],
                    rel_state.{col_grp_state})
                """)
            # Stop when: the iteration limit is reached, the step function
            # signalled completion (status code 3 in the last state element),
            # or the inter-iteration distance fell below the tolerance.
            keep_going = not ctrl.test(
                """
                {iteration} >= _args.max_iter
                or
                _state_current[array_upper(_state_current, 1)] = 3
                or
                {schema_madlib}.__logregr_{optimizer}_step_distance(
                    _state_previous, _state_current) < _args.tolerance
                """)
    return controller.iteration
# ========================================================================
def logregr_train(
        schema_madlib, source_table, out_table, dependent_varname,
        independent_varname, grouping_cols=None, max_iter=None,
        optimizer=None, tolerance=None, verbose=None, **kwargs):
    """
    Train a logistic regression model.

    @param schema_madlib Name of the MADlib schema, properly escaped/quoted
    @param source_table Name of relation containing the training data
    @param out_table Name of relation where the model will be written
    @param dependent_varname Name of dependent column in training data
                             (of type BOOLEAN)
    @param independent_varname Name of independent column in training data
                               (of type DOUBLE PRECISION[])
    @param grouping_cols String of comma-delimited group-by columns
    @param max_iter The maximum number of iterations that are allowed
    @param optimizer Name of the optimizer. 'newton' or 'irls': iteratively
                     reweighted least squares, 'cg': conjugate gradient,
                     'igd': incremental gradient descent
    @param tolerance The precision that the results should have
    @param verbose Whether to surface backend warnings during computation
    @param kwargs Additional arguments are accepted (and ignored) so the
                  caller can unpack a dictionary whose element set is a
                  superset of the arguments required by this function.
    @return A composite value which is __logregr_result defined in
            logistic.sql_in
    """
    # Validation also canonicalizes the optimizer name ('newton' -> 'irls').
    canonical_optimizer = __logregr_validate_args(
        schema_madlib, source_table, out_table, dependent_varname,
        independent_varname, grouping_cols, max_iter, optimizer, tolerance)
    return __logregr_train_compute(
        schema_madlib, source_table, out_table, dependent_varname,
        independent_varname, grouping_cols, max_iter, canonical_optimizer,
        tolerance, verbose, **kwargs)
# ========================================================================
def __logregr_validate_args(schema_madlib, tbl_source, tbl_output, dep_col,
                            ind_col, grouping_col, max_iter, optimizer,
                            tolerance):
    """
    Validate the arguments of logregr_train.

    @param schema_madlib Name of the MADlib schema, properly escaped/quoted
    @param tbl_source Name of relation containing the training data
    @param tbl_output Name of relation where the model will be written;
                      must not already exist
    @param dep_col Dependent column name/expression
    @param ind_col Independent column name/expression
    @param grouping_col String of comma-delimited group-by columns, or None
    @param max_iter Maximum number of iterations; must be positive
    @param optimizer Requested optimizer name
    @param tolerance Convergence threshold; must be non-negative
    @return The canonical optimizer name ('newton' is mapped to 'irls')
    """
    if tbl_source is None or tbl_source.strip().lower() in ('null', ''):
        plpy.error("Logregr error: Invalid data table name!")
    if not table_exists(tbl_source):
        plpy.error("Logregr error: Data table does not exist!")
    if table_is_empty(tbl_source):
        plpy.error("Logregr error: Data table is empty!")
    if tbl_output is None or tbl_output.strip().lower() in ('null', ''):
        plpy.error("Logregr error: Invalid output table name!")
    if table_exists(tbl_output, only_first_schema=True):
        plpy.error("Output table name already exists. Drop the table before "
                   "calling the function.")
    if not dep_col or dep_col.strip().lower() in ('null', ''):
        plpy.error("Logregr error: Invalid dependent column name!")
    # NOTE: the dependent "column" may be an expression, so its existence in
    # the source table is deliberately not verified here.
    # Fix: strip surrounding whitespace before the sentinel comparison, for
    # consistency with the dep_col / table-name checks above.
    if not ind_col or ind_col.strip().lower() in ('null', ''):
        plpy.error("Logregr error: Invalid independent column name!")
    if grouping_col is not None:
        if grouping_col == '':
            plpy.error("Logregr error: Invalid grouping columns name!")
        if not columns_exist_in_table(
                tbl_source, _string_to_array(grouping_col), schema_madlib):
            plpy.error("Logregr error: Grouping column does not exist!")
        # Grouping columns must not collide with the fixed result column
        # names of the output table.
        reserved = frozenset(('coef', 'log_likelihood', 'std_err', 'z_stats',
                              'p_values', 'odds_ratios', 'condition_no',
                              'num_processed', 'num_missing_rows_skipped',
                              'variance_covariance'))
        intersect = frozenset(_string_to_array(grouping_col)) & reserved
        if len(intersect) > 0:
            # Fix: grammar in the user-facing message ("not allow" ->
            # "not allowed").
            plpy.error("Logregr error: Conflicted grouping column name.\n"
                       "Predefined name(s) {0} are not allowed!".format(
                           ', '.join(intersect)))
    if max_iter <= 0:
        plpy.error("Logregr error: Maximum number of iterations must be positive!")
    if tolerance < 0:
        plpy.error("Logregr error: The tolerance cannot be negative!")
    if optimizer == "newton":
        # 'newton' is an alias for iteratively reweighted least squares.
        optimizer = "irls"
    elif optimizer not in ("irls", "cg", "igd"):
        plpy.error(""" Logregr error: Unknown optimizer requested.
            Must be 'newton'/'irls', 'cg', or 'igd'.
            """)
    return optimizer
# ========================================================================
def __logregr_train_compute(schema_madlib, tbl_source, tbl_output, dep_col,
                            ind_col, grouping_col, max_iter, optimizer,
                            tolerance, verbose, **kwargs):
    """
    Create an output table (drop if exists) that contains the logistic
    regression model.

    @param schema_madlib Name of the MADlib schema, properly escaped/quoted
    @param tbl_source Name of relation containing the training data
    @param tbl_output Name of relation where the model is written; a second
                      relation <tbl_output>_summary is also created
    @param dep_col Dependent column name/expression (BOOLEAN)
    @param ind_col Independent column name/expression (DOUBLE PRECISION[])
    @param grouping_col String of comma-delimited group-by columns, or None
    @param max_iter Maximum number of iterations allowed
    @param optimizer Canonical optimizer name ('irls', 'cg' or 'igd')
    @param tolerance Convergence threshold
    @param verbose Whether to let backend warnings through
    @return None; all results are written to tbl_output and the summary table
    """
    # Remember the current message level so it can be restored at the end.
    old_msg_level = plpy.execute("select setting from pg_settings where \
        name='client_min_messages'")[0]['setting']
    if verbose:
        plpy.execute("set client_min_messages to warning")
    else:
        plpy.execute("set client_min_messages to error")
    args = {'schema_madlib': schema_madlib,
            'tbl_source': tbl_source,
            'tbl_output': tbl_output,
            'dep_col': dep_col,
            'ind_col': ind_col,
            'max_iter': max_iter,
            'optimizer': optimizer,
            'tolerance': tolerance,
            # Session-unique names avoid collisions with user objects.
            'tbl_logregr_args': unique_string(),
            'tbl_logregr_state': unique_string(),
            'col_grp_iteration': unique_string(),
            'col_grp_state': unique_string(),
            # Map each optimizer to the SQL function that unpacks its final
            # state into the composite result type.
            'irls': "__logregr_irls_result",
            'newton': "__logregr_irls_result",
            'cg': "__logregr_cg_result",
            'igd': "__logregr_igd_result"}
    plpy.execute("SELECT {schema_madlib}.create_schema_pg_temp()".
                 format(**args))
    # Stash the optimizer parameters in a temp table read by the iteration
    # controller (_args.max_iter / _args.tolerance).
    plpy.execute("""
        CREATE TABLE pg_temp.{tbl_logregr_args} as
        SELECT
            {max_iter} as max_iter,
            {tolerance} as tolerance
        """.format(**args))
    # return an array of dict
    # each dict has two elements: iteration number, and grouping value array
    if grouping_col:
        # Boolean grouping columns are cast to text explicitly; all grouping
        # values are then compared as text.
        grouping_list = explicit_bool_to_text(
            tbl_source, _string_to_array_with_quotes(grouping_col), schema_madlib)
        for i in range(len(grouping_list)):
            grouping_list[i] += "::text"
        grouping_str = ','.join(grouping_list)
    else:
        grouping_str = "Null"
    # Run the iterative optimization; per-iteration states are accumulated
    # in tbl_logregr_state.
    iteration_run = __compute_logregr(schema_madlib, args["tbl_logregr_args"],
                                      args["tbl_logregr_state"], tbl_source,
                                      dep_col, ind_col, optimizer,
                                      grouping_col=grouping_col,
                                      grouping_str=grouping_str,
                                      col_grp_iteration=args[
                                          "col_grp_iteration"],
                                      col_grp_state=args["col_grp_state"])
    # SQL fragments that differ between grouped and non-grouped runs: the
    # grouped form joins on the grouping columns, the non-grouped form uses
    # a cross join ("," + no USING clause) and a trivial GROUP BY predicate.
    grouping_str1 = "" if grouping_col is None else grouping_col + ","
    grouping_str2 = "1 = 1" if grouping_col is None else grouping_col
    using_str = "" if grouping_str1 == "" else "using (" + grouping_col + ")"
    join_str = "," if grouping_str1 == "" else "join "
    # Materialize the model: for each group, pick the state of the last
    # iteration (subquery s), unpack it with the optimizer's result function
    # (subquery t), and join with per-group row counts (q2). A result with
    # status != 1 yields NULL model columns; a NULL result means no rows
    # were processed at all.
    plpy.execute("""
        CREATE TABLE {tbl_output} as
        SELECT
            {grouping_str1}
            (CASE WHEN (result).status = 1 THEN (result).coef
                ELSE NULL::double precision[] END) as coef,
            (CASE WHEN (result).status = 1 THEN (result).log_likelihood
                ELSE NULL::double precision END) as log_likelihood,
            (CASE WHEN (result).status = 1 THEN (result).std_err
                ELSE NULL::double precision[] END) as std_err,
            (CASE WHEN (result).status = 1 THEN (result).z_stats
                ELSE NULL::double precision[] END) as z_stats,
            (CASE WHEN (result).status = 1 THEN (result).p_values
                ELSE NULL::double precision[] END) as p_values,
            (CASE WHEN (result).status = 1 THEN (result).odds_ratios
                ELSE NULL::double precision[] END) as odds_ratios,
            (CASE WHEN (result).status = 1 THEN (result).condition_no
                ELSE NULL::double precision END) as condition_no,
            (CASE WHEN (result).status = 1 THEN (result).num_processed
                when result is NULL THEN 0
                ELSE NULL::bigint end) AS num_rows_processed,
            (CASE WHEN (result).status = 1 THEN num_rows - (result).num_processed
                when result is null THEN num_rows
                ELSE NULL::bigint end) AS num_missing_rows_skipped,
            {col_grp_iteration} as num_iterations,
            (CASE WHEN (result).status = 1 THEN (result).vcov
                ELSE NULL::double precision[] END) as variance_covariance
        FROM
        (
            SELECT {col_grp_iteration}, {grouping_str1} result, num_rows
            FROM (
                (SELECT
                    {grouping_str1}
                    {schema_madlib}.{fnName}({col_grp_state}) as result,
                    {col_grp_iteration}
                FROM
                    {tbl_logregr_state}
                ) t
                JOIN
                (SELECT
                    {grouping_str1}
                    max({col_grp_iteration}) as {col_grp_iteration}
                FROM {tbl_logregr_state}
                GROUP BY {grouping_str2}
                ) s
                USING ({grouping_str1} {col_grp_iteration})
            ) q1
            {join_str}
            (
                SELECT
                    {grouping_str1}
                    count(*) num_rows
                FROM {tbl_source}
                GROUP BY {grouping_str2}
            ) q2
            {using_str}
        ) q3
        """.format(grouping_str1=grouping_str1,
                   grouping_str2=grouping_str2,
                   fnName=args[args["optimizer"]],
                   iteration_run=iteration_run,
                   using_str=using_str,
                   join_str=join_str,
                   **args))
    # A NULL coef marks a group whose fit failed.
    failed_groups = plpy.execute("""
        SELECT count(*) as count
        FROM {tbl_output}
        WHERE coef IS NULL
        """.format(tbl_output=tbl_output))[0]["count"]
    all_groups = plpy.execute("""
        SELECT count(*) AS count
        FROM {tbl_output}
        """.format(**args))[0]["count"]
    # JIRA: MADLIB-1172: When no model can be generated due to ill-conditioned
    # input data, the output table doesn't get populated. In that
    # case, report back an error instead of a successful empty table.
    if failed_groups == all_groups or all_groups == 0:
        plpy.execute("DROP TABLE IF EXISTS " + str(tbl_output))
        plpy.error("Logregr error: No model created possibly "
                   "due to ill-conditioned data.")
    # Aggregate the per-group row counts for the summary table.
    num_rows = plpy.execute("""
        SELECT
            sum(num_rows_processed) AS num_rows_processed,
            sum(num_missing_rows_skipped) AS num_missing_rows_skipped
        FROM {tbl_output}
        """.format(tbl_output=tbl_output))[0]
    if num_rows['num_rows_processed'] is None:
        # No counts available: substitute SQL NULL literals so the summary
        # query below stays valid.
        num_rows['num_rows_processed'] = "NULL"
        num_rows['num_missing_rows_skipped'] = "NULL"
    args.update(num_rows)
    tbl_output_summary = add_postfix(tbl_output, "_summary")
    plpy.execute(
        """
        create table {tbl_output_summary} as
        select
            'logregr'::varchar as method,
            '{tbl_source}'::varchar as source_table,
            '{tbl_output}'::varchar as out_table,
            '{dep_col}'::varchar as dependent_varname,
            '{ind_col}'::varchar as independent_varname,
            'optimizer={optimizer}, max_iter={max_iter}, tolerance={tolerance}'::varchar as optimizer_params,
            {all_groups}::integer as num_all_groups,
            {failed_groups}::integer as num_failed_groups,
            {num_rows_processed}::integer as num_rows_processed,
            {num_missing_rows_skipped}::integer as num_missing_rows_skipped,
            {grouping_col}::text as grouping_col
        """.format(all_groups=all_groups,
                   failed_groups=failed_groups, tbl_output_summary=tbl_output_summary,
                   grouping_col="'" + grouping_col + "'" if grouping_col else "NULL",
                   **args))
    # Clean up the temporary argument/state tables.
    plpy.execute("""
        DROP TABLE IF EXISTS pg_temp.{tbl_logregr_args};
        DROP TABLE IF EXISTS pg_temp.{tbl_logregr_state}
        """.format(**args))
    # Restore the caller's message level.
    plpy.execute("set client_min_messages to " + old_msg_level)
    return None
# --------------------------------------------------------------------
def logregr_help_msg(schema_madlib, message, **kwargs):
    """ Help message for logistic regression.

    @param message A string, the help message indicator
    Returns:
        A string containing the requested help message
    """
    summary_msg = """
----------------------------------------------------------------
                        SUMMARY
----------------------------------------------------------------
Binomial logistic regression models the relationship between a
dichotomous dependent variable and one or more predictor variables.
The dependent variable may be a Boolean value or a categorical variable
that can be represented with a Boolean expression.
For more details on function usage:
    SELECT {schema_madlib}.logregr_train('usage')
"""
    usage_msg = """
------------------------------------------------------------------
                        USAGE
------------------------------------------------------------------
SELECT {schema_madlib}.logregr_train(
    source_table,        -- name of input table
    out_table,           -- name of output table
    dependent_varname,   -- name of dependent variable
    independent_varname, -- names of independent variables
    grouping_cols,       -- optional, default NULL, names of columns to group-by
    max_iter,            -- optional, default 20, maximum iteration number
    optimizer,           -- optional, default 'irls', name of optimization method
    tolerance,           -- optional, default 0.0001, the stopping threshold
    verbose              -- optional, default FALSE, whether to print useful info
);
------------------------------------------------------------------
                        OUTPUT
------------------------------------------------------------------
The output table ('out_table' above) has the following columns:
    <...>,                                -- Grouping column values used during training
    'coef',          double precision[],  -- vector of fitting coefficients
    'log_likelihood', double precision,   -- log likelihood
    'std_err',       double precision[],  -- vector of standard errors of the fitting coefficients
    'z_stats',       double precision[],  -- vector of the z-statistics of the coefficients
    'p_values',      double precision[],  -- vector of the p values
    'odds_ratios',   double precision[],  -- vector of odds ratios, exp(coefficients)
    'condition_no',  double precision,    -- the condition number
    'num_rows_processed', integer,        -- how many rows are actually used in the computation
    'num_missing_rows_skipped', integer,  -- number of rows that contain NULL and were skipped per group
    'num_iterations' double precision     -- how many iterations are used in the computation per group
A summary table named <out_table>_summary is also created at the same time, which has:
    'method'            varchar, -- modeling method name ('logregr')
    'source_table'      varchar, -- the data source table name
    'out_table'         varchar, -- the output table name
    'dependent_varname' varchar, -- the dependent variable
    'independent_varname' varchar, -- the independent variable
    'optimizer_params'  varchar, -- 'optimizer=..., max_iter=..., tolerance=...'
    'num_all_groups'    integer, -- how many groups
    'num_failed_groups' integer, -- how many groups' fitting processes failed
    'num_rows_processed' integer, -- total number of rows used in the computation
    'num_missing_rows_skipped' integer, -- total number of rows skipped
    'grouping_col'      varchar  -- grouping columns used in the regression
"""
    unknown_msg = "No such option. Use {schema_madlib}.logregr_train('help')"
    # Select the message template, then substitute the schema name once.
    if not message:
        selected = summary_msg
    elif message in ('usage', 'help', '?'):
        selected = usage_msg
    else:
        selected = unknown_msg
    return selected.format(schema_madlib=schema_madlib)
# ========================================================================
def logregr_predict_help(schema_madlib, message, **kwargs):
    """ Help message for logistic regression prediction.

    @param message A string, the help message indicator
    Returns:
        A string containing the requested help message
    """
    # Fix: accept an explicit 'summary' request, consistent with
    # logregr_predict_prob_help (previously 'summary' fell through to the
    # 'unknown' branch).
    if not message or message == 'summary':
        message = "summary"
    elif message.lower() in ['usage', 'help', '?']:
        message = 'usage'
    elif message.lower() in ['example', 'examples']:
        message = 'example'
    else:
        message = 'unknown'
    help_string = defaultdict(str)
    help_string['summary'] = """
----------------------------------------------------------------
                        SUMMARY
----------------------------------------------------------------
Prediction for logistic regression can be used to obtain a prediction of the
boolean value of the dependent variable given a value of independent variable.
For more details on function usage:
    SELECT {schema_madlib}.logregr_predict('usage')
For a small example on using the function:
    SELECT {schema_madlib}.logregr_predict('example')
"""
    help_string['usage'] = """
------------------------------------------------------------------
                        USAGE
------------------------------------------------------------------
logregr_predict(
    coef,        -- DOUBLE PRECISION[], Coefficient of logistic regression model
    col_ind_var  -- DOUBLE PRECISION[], Values for the independent variables
)
The lengths of 'coef' array and 'col_ind_var' array above should be equal. For
a small example on using the function:
    SELECT {schema_madlib}.logregr_predict('example')
------------------------------------------------------------------
                        OUTPUT
------------------------------------------------------------------
The output of the function is a BOOLEAN value representing the predicted
dependent variable value for the inputed independent variables.
"""
    help_string['example'] = """
The tables below are obtained from the example in 'logregr_train'.
Details can be found by running "SELECT logregr_train('examples');"
-- We compare the prediction with the actual value 'second_attack'
SELECT id,
       madlib.logregr_predict(coef, ARRAY[1, treatment, trait_anxiety]) as pred_value,
       second_attack as orig_value
FROM patients p, patients_logregr m
ORDER BY id;
"""
    help_string['unknown'] = "No such option. Use {schema_madlib}.logregr_predict()"
    return help_string[message].format(schema_madlib=schema_madlib)
# -------------------------------------------------------------------------
def logregr_predict_prob_help(schema_madlib, message, **kwargs):
    """ Help message for logistic regression probability prediction.

    @param message A string, the help message indicator
    Returns:
        A string containing the requested help message
    """
    summary_text = """
----------------------------------------------------------------
                        SUMMARY
----------------------------------------------------------------
This function can be used to obtain a prediction of the probability of
the dependent variable in a logistic regression model given the value of
independent variables.
For more details on function usage:
    SELECT {schema_madlib}.logregr_predict_prob('usage')
For a small example on using the function:
    SELECT {schema_madlib}.logregr_predict_prob('example')
"""
    usage_text = """
------------------------------------------------------------------
                        USAGE
------------------------------------------------------------------
logregr_predict_prob(
    coef,        -- DOUBLE PRECISION[], Coefficient of logistic regression model
    col_ind_var  -- DOUBLE PRECISION[], Values for the independent variables
)
The lengths of 'coef' array and 'col_ind_var' array above should be equal. For
a small example on using the function:
    SELECT {schema_madlib}.logregr_predict_prob('example')
------------------------------------------------------------------
                        OUTPUT
------------------------------------------------------------------
The output of the function is a DOUBLE PRECISION value representing the
probability of predicted dependent variable value being TRUE for the inputed
independent variable values.
"""
    example_text = """
-- The tables below are obtained from the example in 'logregr_train'.
-- Details can be found by running "SELECT logregr_train('examples');"
SELECT id,
       madlib.logregr_predict_prob(coef, ARRAY[1, treatment, trait_anxiety]) as pred_prob
FROM patients p, patients_logregr m
ORDER BY id;
"""
    unknown_text = "No such option. Use {schema_madlib}.logregr_predict_prob()"
    # Select the message template, then substitute the schema name once.
    if not message or message == 'summary':
        chosen = summary_text
    elif message.lower() in ('usage', 'help', '?'):
        chosen = usage_text
    elif message.lower() in ('example', 'examples'):
        chosen = example_text
    else:
        chosen = unknown_text
    return chosen.format(schema_madlib=schema_madlib)