blob: ffabd3761930bbf053c72be4f95cd85b3977f73e [file] [log] [blame]
# coding=utf-8
"""
@file robust_mlogistic.py_in
@namespace robust
@brief Robust variance: Common functions
"""
import plpy
from utilities.utilities import unique_string
from utilities.utilities import extract_keyvalue_params
from utilities.utilities import _assert
from utilities.utilities import add_postfix
from utilities.validate_args import table_exists
from utilities.utilities import rename_table
from utilities.validate_args import columns_exist_in_table
from regress.robust_linear import _robust_linregr_validate
# use mad_vec to process arrays passed as strings in GPDB < 4.1 and PG < 9.0
from utilities.utilities import __mad_version
version_wrapper = __mad_version()
string_to_array = version_wrapper.select_vecfunc()
array_to_string = version_wrapper.select_vec_return()
def _robust_mlogregr_validate(schema_madlib, source_table,
dependent_varname, independent_varname,
ref_category, max_iter, optimizer,
tolerance, **kwargs):
_assert(ref_category is not None,
"Robust Variance: Reference category cannot be null!")
_assert(ref_category >= 0,
"Robust Variance error: Reference category cannot be negative!")
_assert(max_iter > 0,
"Robust Variance error: Maximum number of iterations must be positive!")
_assert(tolerance >= 0,
"Robust Variance error: The tolerance cannot be negative!")
_assert(optimizer.lower() in ("irls", "newton"),
"Robust Variance error: Optimizer does not exist. Must be 'newton'/'irls'.")
result_w_null = plpy.execute("""
SELECT DISTINCT {dep} AS cat
FROM {source}
WHERE {dep} is not NULL
""".format(source=source_table,
indep=independent_varname,
dep=dependent_varname))
result_wo_null = plpy.execute("""
SELECT DISTINCT {dep} AS cat
FROM {source}
WHERE {dep} is not NULL
AND NOT {madlib}.array_contains_null({indep})
""".format(madlib=schema_madlib, source=source_table,
indep=independent_varname,
dep=dependent_varname))
categories_wo_null = set(i["cat"] for i in result_wo_null)
categories_w_null = set(i["cat"] for i in result_w_null)
_assert(categories_wo_null == categories_w_null,
"Robust Variance error: All observations of category set {0} contain "
"NULL values. These rows should be removed from the dataset "
"before proceeding.".
format(list(categories_w_null - categories_wo_null)))
# -------------------------------------------------------------------------
# Main function call for robust variance multinomial logistic regression
def robust_variance_mlogregr(
schema_madlib, source_table,
out_table, dependent_varname,
independent_varname, ref_category,
grouping_cols=None, optimizer_params=None, regr_coef=None,
verbose_mode=False, **kwargs):
"""
@brief A wrapper function for the robust_variance_mlogregr.
@param source_table string, name of the input table
@param out_table string, name of the output table to be created
@param dependent_varname: string, Column containing the dependent variable
@param independent_varname string, Column containing the array of independent variables
@param ref_category int, Reference category for multinomial logistic regression
@param grouping_cols string, Set of columns to group by.
@param optimizer_params: string, Comma-separated string of optimizer parameters
Supported parameters:
max_iter: Maximum number of iterations (Default = 20)
optimzer: Optimizer to be used (newton/irls, Default = irls)
tolerance: Residual tolerance (Default = 0.0001)
To include an intercept in the model, set one coordinate in the
<tt>independentVariables</tt> array to 1.
Returns:
None
"""
# Reset the message level to avoid random messages
old_msg_level = plpy.execute("""
SELECT setting
FROM pg_settings
WHERE name='client_min_messages'
""")[0]['setting']
if verbose_mode:
plpy.execute('SET client_min_messages TO warning')
else:
plpy.execute('SET client_min_messages TO error')
#extract optimizer parameters
allowed_param_types = {'max_iter': int, 'max_num_iterations': int,
'optimizer': str,
'tolerance': float, 'precision': float}
default_optimizer_values = {'max_iter': 20,
'optimizer': 'irls',
'tolerance': 0.0001}
optimizer_param_dict = extract_keyvalue_params(optimizer_params,
allowed_param_types,
default_optimizer_values)
if not optimizer_param_dict:
optimizer_param_dict = default_optimizer_values
if 'precision' in optimizer_param_dict and \
'tolerance' not in optimizer_param_dict:
optimizer_param_dict['tolerance'] = optimizer_param_dict['precision']
if 'max_num_iterations' in optimizer_param_dict and \
'max_iter' not in optimizer_param_dict:
optimizer_param_dict['max_iter'] = optimizer_param_dict['max_num_iterations']
if optimizer_param_dict['optimizer'].lower() == 'newton':
optimizer_param_dict['optimizer'] = 'irls'
# Validate arguments
_robust_linregr_validate(schema_madlib, source_table, out_table,
dependent_varname, independent_varname,
grouping_cols, verbose_mode)
all_arguments = {'schema_madlib': schema_madlib,
'source_table': source_table,
'output_table': out_table,
'dependent_varname': dependent_varname,
'independent_varname': independent_varname,
'ref_category': ref_category,
'grouping_cols': grouping_cols}
all_arguments.update(optimizer_param_dict)
_robust_mlogregr_validate(**all_arguments)
out_table_summary=add_postfix(out_table, "_summary")
if not grouping_cols:
mlog_out_table = "pg_temp." + unique_string()
mlog_out_table_summary=add_postfix(mlog_out_table, "_summary")
all_arguments['mlog_out_table'] = mlog_out_table
# Run regression
plpy.execute("""
SELECT {schema_madlib}.mlogregr_train(
'{source_table}', '{mlog_out_table}',
'{dependent_varname}', '{independent_varname}', {ref_category},
'max_iter={max_iter}, optimizer={optimizer}, tolerance={tolerance}')
""".format(**all_arguments))
plpy.execute("""
CREATE TABLE {out_table_summary} AS
SELECT
source_table,
'{out_table}'::TEXT AS out_table,
dependent_varname,
independent_varname,
optimizer_params,
ref_category,
num_rows_processed,
num_missing_rows_skipped
FROM
{mlog_out_table_summary}
""".format(out_table=out_table,
out_table_summary=out_table_summary,
mlog_out_table_summary=mlog_out_table_summary))
all_arguments['num_categories'] = plpy.execute(
"SELECT count(DISTINCT {0}) as n_cat FROM {1}".
format(dependent_varname, source_table))[0]['n_cat']
all_arguments['num_features'] = plpy.execute("""
SELECT
array_upper({independent_varname}, 1) fnum
FROM {source_table} LIMIT 1
""".format(**all_arguments))[0]['fnum']
# Run Robust Variance
plpy.execute("""
CREATE TABLE {output_table} AS
SELECT
({schema_madlib}.__mlogregr_format(
(result).coef, {num_features},
{num_categories}, {ref_category})
).category AS category,
{ref_category} as ref_category,
({schema_madlib}.__mlogregr_format(
(result).coef, {num_features},
{num_categories}, {ref_category})
).coef AS coef,
({schema_madlib}.__mlogregr_format(
(result).std_err, {num_features},
{num_categories}, {ref_category})
).coef AS std_err,
({schema_madlib}.__mlogregr_format(
(result).z_stats, {num_features},
{num_categories}, {ref_category})
).coef AS z_stats,
({schema_madlib}.__mlogregr_format(
(result).p_values, {num_features},
{num_categories}, {ref_category})
).coef AS p_values
FROM
(
SELECT
{schema_madlib}.robust_mlogregr(
({dependent_varname})::INTEGER,
{num_categories},
{ref_category},
{independent_varname},
(SELECT
{schema_madlib}.matrix_agg(coef ORDER BY category)
FROM {mlog_out_table})
) AS result
FROM {source_table}
) t1
""".format(**all_arguments))
# Remove the output table of mlogregr_train()
plpy.execute('DROP TABLE IF EXISTS ' + mlog_out_table)
plpy.execute("SET client_min_messages TO %s" % old_msg_level)
# ========================================================================
def robust_variance_mlogregr_new(schema_madlib,
model_table,
out_table,
**kwargs):
"""
Updated function interface to extract parameters from mlogregr_train outputs
Input to this function is the model table obtained from mlogregr_train. All
necessary parameters are obtained from the model table.
Args:
@param model_table: string, Name of the table returned by mlogregr_train
@param output_table: string, name of the output table to be created
"""
_assert(model_table is not None and
model_table.strip().lower() not in ('null', ''),
"Robust Variance error: Invalid model table name")
_assert(table_exists(model_table),
"Robust Variance error: Model table {0} does not exist".
format(model_table))
model_table_summary = add_postfix(model_table, "_summary")
_assert(table_exists(model_table_summary),
"Robust Variance error: Model Summary table {0} does not exist".
format(model_table_summary))
_assert(out_table is not None and
out_table.strip().lower() not in ('null', ''),
"Robust Variance error: Invalid output table name")
_assert(not table_exists(out_table, only_first_schema=True),
"Robust Variance error: Output table {0}"
" already exists".format(str(out_table)))
_assert(columns_exist_in_table(model_table,
['coef']),
"Robust Variance error: Invalid model data table"
" - column coef missing")
_assert(columns_exist_in_table(model_table_summary,
['source_table', 'dependent_varname', 'independent_varname',
'optimizer_params', 'ref_category']),
"Robust Variance error: Invalid model summary table"
" - required columns missing")
# info is a dict that contains source_table, ind_var, dep_var,
info = plpy.execute("SELECT * FROM {0}".format(model_table_summary))[0]
info['out_table'] = out_table
coef = string_to_array(plpy.execute("SELECT coef from {0}".
format(model_table))[0]['coef'])
robust_variance_mlogregr(schema_madlib=schema_madlib, regr_coef=coef,
verbose_mode=False, **info)
# -------------------------------------------------------------------------
def robust_variance_mlogregr_help(schema_madlib, message, **kwargs):
"""
Help function for robust_variance_mlogregr
Args:
@param schema_madlib
@param message: string, Help message string
@param kwargs
Returns:
String. Help/usage information
"""
if not message:
help_string = """
-----------------------------------------------------------------------
SUMMARY
-----------------------------------------------------------------------
Functionality: Calculate Huber-White robust statistics for multinomial logistic regression
For more details on function usage:
SELECT {schema_madlib}.robust_variance_mlogregr('usage')
"""
elif message in ['usage', 'help', '?']:
help_string = """
-----------------------------------------------------------------------
USAGE
-----------------------------------------------------------------------
SELECT {schema_madlib}.robust_variance_mlogregr(
'source_table', -- Name of data table
'out_table', -- Name of result table
'dependent_varname', -- Name of column for dependent variables
'independent_varname', -- Name of column for independent variables
ref_category, -- Reference category for the multinomial logistic regression
-- (Optional, DEFAULT=0)
'grouping_cols', -- An expression list used to group the input dataset into discrete groups.
-- (Optional, DEFAUT=NULL.
-- Not currently implemented. Any non-NULL value is ignored.)
'optimizer_params', -- The optimizer parameters as a comma-separated string
-- (Optional, DEFAULT: max_iter=20, optimizer=irls, tolerance=0.0001)
verbose_mode -- When TRUE, provides verbose output of the results of training.
(Optional, DEFAUT=FALSE)
);
-----------------------------------------------------------------------
OUTUPT
-----------------------------------------------------------------------
The output table ('out_table' above) has the following columns
coef DOUBLE PRECISION[], -- Coefficients of mlogit regression
std_err DOUBLE PRECISION[], -- Huber-White standard errors
z_stats DOUBLE PRECISION[], -- Z-stats of the standard errors
p_values DOUBLE PRECISION[] -- p-values of the standard errors
The output summary table is the same as mlogregr_train(), see also:
SELECT mlogregr_train('usage');
"""
else:
help_string = "No such option. Use {schema_madlib}.robust_variance_mlogregr()"
return help_string.format(schema_madlib=schema_madlib)