blob: 474bc20e3ec9fd1e77863a4d3605d2556575e7a3 [file] [log] [blame]
# coding=utf-8
"""
@file marginal.py_in
@brief Marginal Effects: Common functions
@namespace marginal
"""
import plpy
from utilities.utilities import unique_string
from utilities.utilities import _string_to_array
from utilities.utilities import extract_keyvalue_params
from utilities.utilities import _assert
from utilities.utilities import py_list_to_sql_string
from utilities.validate_args import table_exists
from utilities.validate_args import columns_exist_in_table
from utilities.validate_args import table_is_empty
# use mad_vec to process arrays passed as strings in GPDB < 4.1 and PG < 9.0
from utilities.utilities import __mad_version
version_wrapper = __mad_version()
string_to_array = version_wrapper.select_vecfunc()
array_to_string = version_wrapper.select_vec_return()
# ========================================================================
def _margins_common_validate_args(schema_madlib, source_table, out_table,
dependent_varname, independent_varname,
grouping_cols, marginal_vars,
max_iter, tolerance, **kwargs):
_assert(source_table and
source_table.strip().lower() not in ('null', ''),
"Margins error: Invalid data table name!")
_assert(table_exists(source_table),
"Margins error: Data table does not exist!")
_assert(not table_is_empty(source_table),
"Margins error: Data table is empty!")
_assert(out_table and
out_table.strip().lower() not in ('null', ''),
"Margins error: Invalid output table name!")
_assert(not table_exists(out_table, only_first_schema=True),
"Margins error: Output table already exists!")
_assert(not table_exists(out_table + '_summary', only_first_schema=True),
"Margins error: Output summary table already exists!")
_assert(dependent_varname and
dependent_varname.strip().lower() not in ('null', ''),
"Margins error: Invalid dependent column name!")
_assert(independent_varname and
independent_varname.strip().lower() not in ('null', ''),
"Margins error: Invalid independent column name!")
if grouping_cols:
_assert(grouping_cols.strip().lower() not in ('null', ''),
"Margins error: Invalid grouping columns name!")
_assert(columns_exist_in_table(
source_table, _string_to_array(grouping_cols), schema_madlib),
"Margins error: Grouping column does not exist!")
if marginal_vars:
vec = string_to_array(marginal_vars, text=False)
_assert(min(vec) is not None,
"Margins error: NULL value found in marginal_vars!")
lower_dim = plpy.execute("select array_lower({0},1) as d from {1}".format(
independent_varname, source_table))[0]['d']
upper_dim = plpy.execute("select array_upper({0},1) as d from {1}".format(
independent_varname, source_table))[0]['d']
n = upper_dim - lower_dim + 1
_assert(min(vec) >= 1,
"Margins error: All indices in the marginal coefficients "
"array must be >= 1.")
_assert(max(vec) <= n,
"Margins error: All indices in the marginal coefficients "
"array must be <= the number of independent variables.")
_assert(max_iter is not None,
"Margins error: The max_iter should not be NULL!")
_assert(max_iter > 0,
"Margins error: Maximum number of iterations must be positive!")
_assert(tolerance is not None,
"Margins error: The tolerance should not be NULL!")
_assert(tolerance >= 0,
"Margins error: The tolerance cannot be negative!")
# -------------------------------------------------------------------------
def _margins_logregr_validate_args(optimizer):
_assert(optimizer is not None,
"Margins error: Optimizer should not be NULL")
_assert(optimizer in ("irls", "cg", "igd"),
"Margins error: Optimizer does not exist. Must be 'newton'/'irls', 'cg', or 'igd'.")
# Main function call for marginal logisitc regression
def margins_logregr(schema_madlib, source_table, out_table,
dependent_varname, independent_varname,
grouping_cols, marginal_vars, max_iter,
optimizer, tolerance, verbose_mode, **kwargs):
"""
@brief A wrapper function for the various marginal regression analyzes.
@param source_table String identifying the input table
@param out_table String identifying the output table to be created
@param dependent_varname Column containing the dependent variable
@param independent_varname Column containing the array of independent variables
@param grouping_cols Set of columns to group by.
@param marginal_vars Subset of independent variables to calculate marginal effects for.
@param max_iter Maximum number of iterations
@param optimzer Optimizer to be used (newton/irls, cg or idg)
@param tolerance Resiual tolerance
@par
To include an intercept in the model, set one coordinate in the
<tt>independentVariables</tt> array to 1.
@return void
@usage
For function summary information. Run
sql> select margins_logregr('help');
OR
sql> select margins_logregr();
OR
sql> select margins_logregr('?');
For function usage information. Run
sql> select margins_logregr('usage');
"""
plpy.warning("This function has been deprecated and replaced by 'margins'")
# Reset the message level to avoid random messages
old_msg_level = plpy.execute("""
SELECT setting
FROM pg_settings
WHERE name='client_min_messages'
""")[0]['setting']
plpy.execute('SET client_min_messages TO warning')
if optimizer is not None and optimizer.lower() == 'newton':
optimizer = 'irls'
# Validate arguments
_margins_common_validate_args(schema_madlib, source_table, out_table,
dependent_varname, independent_varname,
grouping_cols, marginal_vars, max_iter,
tolerance)
_margins_logregr_validate_args(optimizer)
group_col_str = 'NULL' if grouping_cols is None else "'" + grouping_cols + "'"
optimizer_str = 'NULL' if optimizer is None else "'" + optimizer + "'"
maxiter_str = 'NULL' if max_iter is None else max_iter
tolerance_str = 'NULL' if tolerance is None else tolerance
logr_out_table = "pg_temp." + unique_string()
# No grouping
if not grouping_cols:
# Run logistic regression
plpy.execute("""
SELECT {schema_madlib}.logregr_train(
'{source_table}', '{logr_out_table}',
'{dependent_varname}', '{independent_varname}', {group_col_str},
{maxiter_str}, {optimizer_str}, {tolerance_str}, {verbose})
""".format(schema_madlib=schema_madlib, source_table=source_table,
logr_out_table=logr_out_table, dependent_varname=dependent_varname,
independent_varname=independent_varname, group_col_str=group_col_str,
maxiter_str=maxiter_str, optimizer_str=optimizer_str,
tolerance_str=tolerance_str, verbose=verbose_mode))
# Rename the output summary table
plpy.execute("""CREATE TABLE {out_table}_summary AS
SELECT * FROM {logr_out_table}_summary""".format(**locals()))
plpy.execute("""UPDATE {out_table}_summary SET out_table = '{out_table}'
""".format(out_table=out_table))
coef = plpy.execute("select coef from {0}".format(logr_out_table))[0]['coef']
if coef is None:
plpy.error("Margins error: No fitting coefficients were computed!")
num_depvars = plpy.execute("""
SELECT array_upper(coef, 1) num_depvars from {0}
""".format(logr_out_table))[0]['num_depvars']
if num_depvars is not None:
# If marginal variables are none. Then chose all variables
if not marginal_vars:
marginal_vars = range(1, num_depvars + 1)
else:
marginal_vars = map(int, string_to_array(marginal_vars))
index_array = py_list_to_sql_string(marginal_vars, array_type="integer")
else:
index_array = 'NULL'
# Run marginal effects for logistic regression
plpy.execute("""
CREATE TABLE {out_table} AS
SELECT
{schema_madlib}.__sub_array((res).margins, {index_array}) AS margins,
{schema_madlib}.__sub_array((res).std_err, {index_array}) AS std_err,
{schema_madlib}.__sub_array((res).z_stats, {index_array}) AS z_stats,
{schema_madlib}.__sub_array((res).p_values, {index_array}) AS p_values
FROM
(
SELECT
{schema_madlib}.marginal_logregr(
({dependent_varname})::BOOLEAN,
{independent_varname},
(SELECT coef FROM {logr_out_table})) AS res
FROM
{source_table}
) t1
""".format(schema_madlib=schema_madlib,
source_table=source_table, out_table=out_table,
dependent_varname=dependent_varname,
independent_varname=independent_varname,
logr_out_table=logr_out_table, index_array=index_array))
# Drop the output table of logregr_train()
plpy.execute('DROP TABLE IF EXISTS ' + logr_out_table)
plpy.execute("SET client_min_messages TO %s" % old_msg_level)
# -------------------------------------------------------------------------
def margins_logregr_help(schema_madlib, message, **kwargs):
plpy.warning("This function has been deprecated and replaced by 'margins'")
if not message:
help_string = """
-----------------------------------------------------------------------
SUMMARY
-----------------------------------------------------------------------
Functionality: Calculate marginal effects for logistic regression
For more details on function usage:
SELECT {schema_madlib}.margins_logregr('usage')
"""
elif message in ['usage', 'help', '?']:
help_string = """
-----------------------------------------------------------------------
USAGE
-----------------------------------------------------------------------
SELECT {schema_madlib}.margins_logregr(
'source_table', -- Name of data table
'output_table', -- Name of result table
'dependent_variable', -- Name of column for dependent variables
'independent_variable', -- Name of column for independent variables
'grouping_cols', -- An expression list used to group the input dataset into discrete groups.
-- (Optional, DEFAUT=NULL.
-- Not currently implemented. Any non-NULL value is ignored.)
marginal_vars, -- Indices of variables to calculate marginal effects on
-- (Optional, DEFAULT: All independent variables)
'max_iter', -- [OPTIONAL] The number of iterations used by the logistic regression solver. Default is 20.
'optimizer', -- [OPTIONAL] Name of the optimizer used in the logistic regression. Default is irls.
'tolerance', -- [OPTIONAL] The tolerance of the logistic regression optimizer. Default is 0.0001.
'verbose_mode' -- [OPTIONAL] Should the optimizer print warning messages to the screen. Default is FALSE.
);
-----------------------------------------------------------------------
OUTUPT
-----------------------------------------------------------------------
The output table ('output_table' above) has the following columns
margins DOUBLE PRECISION[], -- Marginal effects
std_err DOUBLE PRECISION[], -- Standard errors using delta method
z_stats DOUBLE PRECISION[], -- z-stats of the standard errors
p_values DOUBLE PRECISION[], -- p-values of the standard errors
The output summary table is the same as logregr_train(), see also:
SELECT logregr_train('usage');
"""
else:
help_string = "No such option. Use {schema_madlib}.margins_mlogregr()"
return help_string.format(schema_madlib=schema_madlib)
# ========================================================================
# -----------------------------------------------------------------------
# Marginal Effects for multinomial logistic regression
# -----------------------------------------------------------------------
# ========================================================================
# Input handling for multinomial logistic regression
def _margins_mlogregr_validate_args(schema_madlib, source_table,
dependent_varname, independent_varname,
ref_category, optimizer, **kwargs):
"""
Validate the arguments
"""
_assert(ref_category is not None,
"Margins: Reference category cannot be null!")
_assert(ref_category >= 0,
"Margins error: Reference category cannot be negative!")
_assert(optimizer is not None,
"Margins error: Optimizer cannot be NULL!")
_assert(optimizer.lower() in ("irls"),
"Margins error: Optimizer does not exist. Must be 'newton'/'irls'.")
result_w_null = plpy.execute("""
SELECT DISTINCT {dep} AS cat
FROM {source}
WHERE {dep} is not NULL
""".format(source=source_table,
indep=independent_varname,
dep=dependent_varname))
result_wo_null = plpy.execute("""
SELECT DISTINCT {dep} AS cat
FROM {source}
WHERE {dep} is not NULL
AND NOT {madlib}.array_contains_null({indep})
""".format(madlib=schema_madlib, source=source_table,
indep=independent_varname,
dep=dependent_varname))
categories_wo_null = set(i["cat"] for i in result_wo_null)
categories_w_null = set(i["cat"] for i in result_w_null)
_assert(categories_wo_null == categories_w_null,
"Margins error: All observations of category set {0} contain "
"NULL values. These rows should be removed from the dataset "
"before proceeding.".
format(list(categories_w_null - categories_wo_null)))
# ========================================================================
# Main function call for marginal multinomial logistic regression
def margins_mlogregr_main(schema_madlib, source_table,
out_table, dependent_varname,
independent_varname, ref_category,
marginal_vars, grouping_cols,
optimizer_params, regr_coef=None, **kwargs):
"""
@brief A wrapper function for the marginal_logregr.
@param source_table string, name of the input table
@param out_table string, name of the output table to be created
@param dependent_varname: string, Column containing the dependent variable
@param independent_varname string, Column containing the array of independent variables
@param ref_category int, Reference category for multinomial logistic regression
@param grouping_cols string, Set of columns to group by.
@param marginal_vars string, Subset of independent variables to calculate marginal effects for.
@param optimizer_params: string, Comma-separated string of optimizer parameters
Supported parameters:
max_iter Maximum number of iterations
optimzer Optimizer to be used (newton/irls, cg or idg)
tolerance Resiual tolerance
To include an intercept in the model, set one coordinate in the
<tt>independentVariables</tt> array to 1.
Returns:
None
"""
allowed_param_types = {'max_iter': int, 'max_num_iterations': int,
'optimizer': str,
'tolerance': float}
default_optimizer_values = {'max_iter': 20,
'optimizer': 'irls',
'tolerance': 0.0001}
optimizer_param_dict = extract_keyvalue_params(optimizer_params,
allowed_param_types,
default_optimizer_values)
optimizer_param_dict.update(kwargs)
margins_mlogregr(schema_madlib, source_table, out_table,
dependent_varname, independent_varname,
ref_category, marginal_vars,
grouping_cols=grouping_cols,
regr_coef=regr_coef,
**optimizer_param_dict)
# ========================================================================
def margins_mlogregr_new(schema_madlib,
model_table,
out_table,
marginal_vars=None,
**kwargs):
"""
Updated function interface to Margins for multinomial logistic regression.
Input to this function is the model table obtained from mlogregr_train. All
necessary parameters are obtained from the model table.
Args:
@param model_table: string, Name of the table returned by mlogregr_train
@param out_table: string, name of the output table to be created
@param marginal_vars: list, Integer array of indices of
"""
_assert(model_table is not None and
model_table.strip().lower() not in ('null', ''),
"Margins error: Invalid model table name")
_assert(table_exists(model_table),
"Margins error: Model table {0} does not exist".
format(model_table))
_assert(table_exists(model_table + "_summary"),
"Margins error: Model Summary table {0} does not exist".
format(model_table + "_summary"))
_assert(out_table is not None and
out_table.strip().lower() not in ('null', ''),
"Margins error: Invalid output table name")
_assert(not table_exists(out_table, only_first_schema=True),
"Margins error: Output table {0}"
" already exists".format(str(out_table)))
_assert(columns_exist_in_table(model_table,
['coef']),
"Margins error: Invalid model data table"
" - column coef missing")
_assert(columns_exist_in_table(model_table + '_summary',
['source_table', 'dependent_varname', 'independent_varname',
'optimizer_params', 'ref_category']),
"Margins error: Invalid model summary table"
" - some required columns missing")
# info is a dict that contains source_table, ind_var, dep_var,
info = plpy.execute("SELECT * FROM {0}_summary".format(model_table))[0]
coef = string_to_array(plpy.execute("SELECT coef from {0}".
format(model_table))[0]['coef'])
info['out_table'] = out_table
margins_mlogregr_main(schema_madlib=schema_madlib, regr_coef=coef,
marginal_vars=marginal_vars,
grouping_cols=None, **info)
# ========================================================================
def margins_mlogregr(schema_madlib, source_table, out_table,
dependent_varname, independent_varname,
ref_category, marginal_vars,
max_iter, optimizer, tolerance,
grouping_cols=None, regr_coef=None,
**kwargs):
"""
@brief A wrapper function for the various marginal regression analyzes.
@param source_table String identifying the input table
@param out_table String identifying the output table to be created
@param dependent_varname Column containing the dependent variable
@param independent_varname Column containing the array of independent variables
@param ref_category Reference category for multinomial logistic regression
@param grouping_cols Set of columns to group by.
@param marginal_vars Subset of indices of independent variables to calculate marginal effects for.
@param max_iter Maximum number of iterations
@param optimzer Optimizer to be used (newton/irls, cg or idg)
@param tolerance Resiual tolerance
@par
To include an intercept in the model, set one coordinate in the
<tt>independentVariables</tt> array to 1.
@return void
@usage
For function summary information. Run
sql> select margins_mlogregr('help');
OR
sql> select margins_mlogregr();
OR
sql> select margins_mlogregr('?');
"""
# Reset the message level to avoid random messages
old_msg_level = plpy.execute("""
SELECT setting
FROM pg_settings
WHERE name='client_min_messages'
""")[0]['setting']
if 'verbose_mode' in kwargs and kwargs['verbose_mode']:
plpy.execute('SET client_min_messages TO warning')
else:
plpy.execute('SET client_min_messages TO error')
if optimizer.lower() == 'newton':
optimizer = 'irls'
# Validate arguments
_margins_common_validate_args(schema_madlib, source_table, out_table,
dependent_varname, independent_varname,
grouping_cols, marginal_vars,
max_iter, tolerance)
all_arguments = {'schema_madlib': schema_madlib,
'source_table': source_table,
'out_table': out_table,
'dependent_varname': dependent_varname,
'independent_varname': independent_varname,
'ref_category': ref_category,
'grouping_cols': grouping_cols,
'marginal_vars': marginal_vars,
'max_iter': max_iter,
'optimizer': optimizer,
'tolerance': tolerance}
_margins_mlogregr_validate_args(**all_arguments)
# NOTICE: * support was removed because other modules did not have it.
# Uncomment the following code if you want to re-add '*' support
# Check for '*' in indepdendent variables
#if independent_varname == "*":
# all_arguments['independent_varname'] = \
# _internal_return_all_except_dep_vars(schema_madlib,
# source_table,
# dependent_varname
# )
# No grouping
if not grouping_cols:
mlog_out_table = "pg_temp." + unique_string()
all_arguments['mlog_out_table'] = mlog_out_table
# Run regression
plpy.execute("""
SELECT {schema_madlib}.mlogregr_train(
'{source_table}', '{mlog_out_table}',
'{dependent_varname}', '{independent_varname}', {ref_category},
'max_iter={max_iter}, optimizer={optimizer}, tolerance={tolerance}')
""".format(**all_arguments))
# Rename the output summary table
plpy.execute("""CREATE TABLE {out_table}_summary AS
SELECT * FROM {mlog_out_table}_summary""".format(**locals()))
plpy.execute("UPDATE {out_table}_summary SET out_table = '{out_table}'".
format(out_table=out_table))
num_categories = plpy.execute(
"SELECT count(DISTINCT {0}) as n_cat FROM {1}".
format(dependent_varname, source_table))[0]['n_cat']
all_arguments['num_categories'] = num_categories
num_indepvars = plpy.execute("""
SELECT array_upper(coef, 1) AS num_indepvars FROM {mlog_out_table}
""".format(mlog_out_table=mlog_out_table))[0]['num_indepvars']
if num_indepvars is not None:
# Note: The marginal_vars is a base 1 array
# If marginal variables are none. Then chose all variables
if not marginal_vars:
num_features = num_indepvars
marginal_vars = range(1, num_indepvars * (num_categories - 1) + 1)
else:
# For each independent var, get all the indices
num_features = len(marginal_vars)
marginal_vars_only = map(int, string_to_array(marginal_vars))
marginal_vars = []
for j in range(num_categories - 1):
for m in marginal_vars_only:
marginal_vars.append((m - 1) * (num_categories - 1) + j + 1)
all_arguments['index_array'] = py_list_to_sql_string(marginal_vars,
"integer")
all_arguments['num_features'] = num_features
else:
all_arguments['index_array'] = 'NULL'
all_arguments['num_features'] = 'NULL'
# Run Robust Variance
plpy.execute("""
CREATE TABLE {out_table} AS
SELECT
({schema_madlib}.__mlogregr_format(
margins, {num_features},
{num_categories}, {ref_category})
).category AS category,
{ref_category} as ref_category,
({schema_madlib}.__mlogregr_format(
margins, {num_features},
{num_categories}, {ref_category})
).coef AS margins,
({schema_madlib}.__mlogregr_format(
std_err, {num_features},
{num_categories}, {ref_category})
).coef AS std_err,
({schema_madlib}.__mlogregr_format(
z_stats, {num_features},
{num_categories}, {ref_category})
).coef AS z_stats,
({schema_madlib}.__mlogregr_format(
p_values, {num_features},
{num_categories}, {ref_category})
).coef AS p_values
FROM
(
SELECT
{schema_madlib}.__sub_array((res).margins, {index_array}) AS margins,
{schema_madlib}.__sub_array((res).std_err, {index_array}) AS std_err,
{schema_madlib}.__sub_array((res).z_stats, {index_array}) AS z_stats,
{schema_madlib}.__sub_array((res).p_values, {index_array}) AS p_values
FROM
(
SELECT
{schema_madlib}.marginal_mlogregr(
({dependent_varname})::INTEGER,
{num_categories},
{ref_category},
{independent_varname},
(SELECT
{schema_madlib}.matrix_agg(coef ORDER BY category)
FROM {mlog_out_table})
) AS res
FROM {source_table}
) t1
) t2
""".format(**all_arguments))
# Drop the output table of mlogregr_train()
plpy.execute("DROP TABLE IF EXISTS " + mlog_out_table)
plpy.execute("SET client_min_messages TO %s" % old_msg_level)
# ========================================================================
def margins_mlogregr_help(schema_madlib, message, **kwargs):
"""
Help function for marginal_mlogregr
Args:
@param schema_madlib
@param message: string, Help message string
@param kwargs
Returns:
String. Help/usage information
"""
if not message:
help_string = """
-----------------------------------------------------------------------
SUMMARY
-----------------------------------------------------------------------
Functionality: Calculate marginal effects for multinomial logistic regression
For more details on function usage:
SELECT {schema_madlib}.margins_mlogregr('usage')
"""
elif message in ['usage', 'help', '?']:
help_string = """
-----------------------------------------------------------------------
USAGE
-----------------------------------------------------------------------
SELECT {schema_madlib}.margins_mlogregr(
'source_table', -- Name of data table
'output_table', -- Name of result table
'dependent_variable', -- Name of column for dependent variables
'independent_variable', -- Name of column for independent variables
ref_category, -- Reference category for the multinomial logistic regression
-- (Optional, DEFAULT=0)
'grouping_cols', -- An expression list used to group the input dataset into discrete groups.
-- (Optional, DEFAUT=NULL.
-- Not currently implemented. Any non-NULL value is ignored.)
marginal_vars, -- Indices of variables to calculate marginal effects on
-- (Optional, DEFAULT: All independent variables)
'optimizer_params', -- The optimizer parameters as a comma-separated string
-- (Optional, DEFAULT: max_iter=20, optimizer=irls, tolerance=0.0001)
verbose_mode -- When TRUE, provides verbose output of the results of training.
(Optional, DEFAUT=FALSE)
);
-----------------------------------------------------------------------
OUTUPT
-----------------------------------------------------------------------
The output table ('output_table' above) has the following columns
margins DOUBLE PRECISION[], -- Marginal effects
std_err DOUBLE PRECISION[], -- Standard errors using delta method
z_stats DOUBLE PRECISION[], -- z-stats of the standard errors
p_values DOUBLE PRECISION[], -- p-values of the standard errors
The output summary table is the same as mlogregr_train(), see also:
SELECT mlogregr_train('usage');
"""
else:
help_string = "No such option. Use {schema_madlib}.margins_mlogregr()"
return help_string.format(schema_madlib=schema_madlib)