# ------------------------------------------------------------------------
# Compute clustered errors
# ------------------------------------------------------------------------
import plpy
from utilities.utilities import __unique_string
from utilities.utilities import _array_to_string
from utilities.utilities import _string_to_array
from utilities.validate_args import columns_exist_in_table
from utilities.validate_args import table_is_empty
from utilities.validate_args import scalar_col_has_no_null
from utilities.validate_args import table_exists
# ========================================================================
def __prepare_strings(clustervar, grouping_col):
"""
Prepare the needed strings
"""
fitres = __unique_string() # result table for linear regression
if grouping_col is None:
grouping_col_str = "NULL"
else:
grouping_col_str = "'" + grouping_col + "'"
coef_str = __unique_string()
if clustervar is None:
cluster_grouping_str = "group by {coef_str}".format(coef_str = coef_str)
else:
cluster_grouping_str = "group by {coef_str}, ".format(coef_str = coef_str) + clustervar
return (fitres, coef_str, cluster_grouping_str, grouping_col_str)
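# Illustrative sketch only: for a hypothetical call
#     __prepare_strings('id_cluster', None)
# the helper returns something like
#     ('<temp_fit_table>', '<temp_coef_name>',
#      'group by <temp_coef_name>, id_cluster', 'NULL')
# where the first two entries are unique temporary names produced by
# __unique_string(); 'id_cluster' is an assumed column name used only for
# this example.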
# ------------------------------------------------------------------------
def __generate_clustered_sql(**kwargs):
    """
    Create the SQL query to execute and create the result table
    """
    sqlPart1 = """
        create table {tbl_output} as
            select (f).* from (
                select {schema_madlib}.__clustered_{regr_type}_compute_stats(
                    max(coef),
                    m4_ifdef(`__POSTGRESQL__', `{schema_madlib}.__array_')sum(meatvec),
                    m4_ifdef(`__POSTGRESQL__', `{schema_madlib}.__array_')sum(breadvec),
                    count(numRows)::integer, (sum(numRows))::integer) as f
                from (
                    select (g).*, coef, numRows from (
                        select""".format(**kwargs)
    if kwargs['regr_type'] == 'mlog':
        # We need to know the reference category and the number of categories
        # if doing the mlog regression
        sqlPart2 = """ {schema_madlib}.__clustered_err_{regr_type}_step({depvar},
            {indvar}, {coef_str},
            (select count(distinct {depvar}) from {tbl_data})::INTEGER,
            ({ref_category})::INTEGER) as g,""".format(**kwargs)
    else:
        sqlPart2 = """ {schema_madlib}.__clustered_err_{regr_type}_step({depvar},
            {indvar}, {coef_str}) as g,""".format(**kwargs)
    sqlPart3 = """ {coef_str} as coef,
                        count({depvar}) as numRows
                        from (
                            select u.coef as {coef_str}, v.*
                            from
                                {fitres} u, {tbl_data} v
                        ) s
                        {cluster_grouping_str}) t) p) q
        """.format(**kwargs)
    return sqlPart1 + sqlPart2 + sqlPart3
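# Rough sketch (for orientation only) of the query assembled above when
# regr_type = 'lin'. Identifiers in angle brackets stand for the caller's
# arguments, <coef_str> stands for the unique string generated for coef_str,
# and on PostgreSQL the two sum() calls are rewritten by the m4 macro to
# {schema_madlib}.__array_sum():
#
#   create table <tbl_output> as
#   select (f).* from (
#       select <schema_madlib>.__clustered_lin_compute_stats(
#                  max(coef), sum(meatvec), sum(breadvec),
#                  count(numRows)::integer, (sum(numRows))::integer) as f
#       from (
#           select (g).*, coef, numRows from (
#               select <schema_madlib>.__clustered_err_lin_step(
#                          <depvar>, <indvar>, <coef_str>) as g,
#                      <coef_str> as coef,
#                      count(<depvar>) as numRows
#               from (select u.coef as <coef_str>, v.*
#                     from <fitres> u, <tbl_data> v) s
#               group by <coef_str>, <clustervar>) t) p) q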
# ========================================================================
def clustered_variance_linregr(schema_madlib, tbl_data, tbl_output,
                               depvar, indvar, clustervar, grouping_col,
                               **kwargs):
    """
    Linear regression clustered standard errors
    """
    old_msg_level = plpy.execute(
        "select setting from pg_settings where name='client_min_messages'"
    )[0]['setting']
    plpy.execute("set client_min_messages to error")
    validate_args_clustered_variance_linregr(schema_madlib, tbl_data,
                                             tbl_output, depvar, indvar,
                                             clustervar, grouping_col)
    (fitres, coef_str, cluster_grouping_str,
     grouping_col_str) = __prepare_strings(clustervar, grouping_col)
    plpy.execute(
        """
        select {schema_madlib}.linregr_train('{tbl_data}',
            '{fitres}', '{depvar}', '{indvar}', {grouping_col})
        """.format(schema_madlib=schema_madlib, tbl_data=tbl_data,
                   fitres=fitres, depvar=depvar, indvar=indvar,
                   grouping_col=grouping_col_str))
    plpy.execute(
        __generate_clustered_sql(schema_madlib=schema_madlib, depvar=depvar,
                                 indvar=indvar, coef_str=coef_str,
                                 tbl_data=tbl_data, fitres=fitres,
                                 cluster_grouping_str=cluster_grouping_str,
                                 tbl_output=tbl_output, regr_type="lin"))
    plpy.execute(
        """
        drop table if exists {fitres}
        """.format(fitres=fitres))
    plpy.execute("set client_min_messages to " + old_msg_level)
    return None
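# Hypothetical usage (the table, column, and schema names below are made up
# purely for illustration; the SQL-level wrapper is assumed to be installed
# in a schema called "madlib"):
#
#   SELECT madlib.clustered_variance_linregr(
#       'houses',                        -- data table
#       'houses_clustered_out',          -- output table (must not exist yet)
#       'price',                         -- dependent variable
#       'ARRAY[1, bedroom, bath, size]', -- independent variables
#       'zipcode',                       -- cluster column(s)
#       NULL                             -- no grouping columns
#   );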
# ========================================================================
def validate_args_clustered_variance(schema_madlib, tbl_data,
                                     tbl_output, depvar, indvar,
                                     clustervar, grouping_col):
    """
    Validate the parameters
    """
    if not tbl_data or tbl_data in ('null', '') or not table_exists(tbl_data):
        plpy.error("Clustered variance estimation error: Data table does not exist!")
    if table_is_empty(tbl_data):
        plpy.error("Clustered variance estimation error: Data table is empty!")
    if tbl_output is None or tbl_output.lower() in ('null', ''):
        plpy.error("Clustered variance estimation error: Invalid output table name!")
    if table_exists(tbl_output):
        plpy.error("Clustered variance estimation error: Output table exists!")
    if depvar is None or not isinstance(depvar, str) or depvar.lower() in ('null', ''):
        plpy.error("Clustered variance estimation error: Invalid dependent column name!")
    if indvar is None or not isinstance(indvar, str) or indvar.lower() in ('null', ''):
        plpy.error("Clustered variance estimation error: Invalid independent column name!")
    # try:
    #     plpy.execute("select {indvar}".format(indvar = indvar))
    #     isconst = True
    # except:
    #     isconst = False
    # if isconst:
    #     plpy.error("Clustered variance estimation error: independent variable is a constant!")
    # try:
    #     plpy.execute("select {depvar}".format(depvar = depvar))
    #     notconst = False
    # except:
    #     notconst = True
    # if notconst is False:
    #     plpy.error("Clustered variance estimation error: dependent variable is a constant!")
    # try:
    #     plpy.execute("select {indvar} from {tbl} limit 1".format(indvar = indvar, tbl = tbl_data))
    #     success = True
    # except:
    #     success = False
    # if success is False:
    #     plpy.error("Clustered variance estimation error: independent variable does not exist in the data table!")
    # try:
    #     plpy.execute("select {depvar} from {tbl} limit 1".format(depvar = depvar, tbl = tbl_data))
    #     success = True
    # except:
    #     success = False
    # if not success:
    #     plpy.error("Clustered variance estimation error: dependent variable does not exist in the data table!")
    if not scalar_col_has_no_null(tbl_data, depvar):
        plpy.error("Clustered variance estimation error: Dependent variable has Null values! "
                   "Please filter out Null values before using this function!")
    # clustervar is required and must name existing column(s)
    if clustervar is None or clustervar.lower() in ('null', ''):
        plpy.error("Clustered variance estimation error: Invalid cluster column name!")
    if clustervar is not None:
        if not columns_exist_in_table(tbl_data,
                                      _string_to_array(clustervar), schema_madlib):
            plpy.error("Clustered variance estimation error: Cluster column does not exist!")
    # grouping_col is optional, but if provided it must name existing column(s)
    if grouping_col is not None and grouping_col.lower() in ('null', ''):
        plpy.error("Clustered variance estimation error: Invalid grouping column name!")
    if grouping_col:
        if not columns_exist_in_table(tbl_data,
                                      _string_to_array(grouping_col), schema_madlib):
            plpy.error("Clustered variance estimation error: Grouping column does not exist!")
# ========================================================================
def validate_args_clustered_variance_linregr(schema_madlib, tbl_data,
                                             tbl_output, depvar, indvar,
                                             clustervar, grouping_col):
    """
    Validate the parameters
    """
    validate_args_clustered_variance(schema_madlib, tbl_data,
                                     tbl_output, depvar, indvar,
                                     clustervar, grouping_col)
# ========================================================================
def clustered_variance_linregr_help(schema_madlib, msg=None, **kwargs):
    """
    Print help messages
    """
    if msg is None or msg.strip(' ') in ('help', '?'):
        return """
        ----------------------------------------------------------------
        Summary
        ----------------------------------------------------------------
        Computes the clustered standard errors for linear regression.
        The function first runs a linear regression to get the fitting
        coefficients, and then computes the clustered standard errors.

        SELECT {schema_madlib}.clustered_variance_linregr(
            'tbl_data',
            'tbl_output',
            'depvar',
            'indvar',
            'clustervar',
            'grouping_col'
        );

        Run:
        SELECT {schema_madlib}.clustered_variance_linregr('usage');
        to get more information.
        """.format(schema_madlib=schema_madlib)

    if msg.strip(' ') == 'usage':
        return """
        Usage:
        ----------------------------------------------------------------
        SELECT {schema_madlib}.clustered_variance_linregr(
            'tbl_data',    -- Name of the data table
            'tbl_output',  -- Name of the result table (raises an error if it already exists)
            'depvar',      -- Expression for the dependent variable
            'indvar',      -- Expression for the independent variables
            'clustervar',  -- Comma-separated cluster column names
            'grouping_col' -- Comma-separated grouping column names, default NULL
        );

        Output:
        ----------------------------------------------------------------
        The output table has the following columns:
            coef     DOUBLE PRECISION[], -- Fitting coefficients
            std_err  DOUBLE PRECISION[], -- Clustered standard errors of the coefficients
            t_stats  DOUBLE PRECISION[], -- t-statistics of the coefficients
            p_values DOUBLE PRECISION[]  -- p-values of the coefficients
        """.format(schema_madlib=schema_madlib)
# ========================================================================
# Clustered errors for logistic regression
# ========================================================================
def clustered_variance_logregr(schema_madlib, tbl_data, tbl_output,
                               depvar, indvar, clustervar, grouping_col,
                               max_iter, optimizer, tolerance, verbose,
                               **kwargs):
    """
    Logistic regression clustered standard errors
    """
    validate_args_clustered_variance_logregr(schema_madlib, tbl_data,
                                             tbl_output, depvar, indvar,
                                             clustervar, grouping_col,
                                             max_iter, optimizer, tolerance,
                                             verbose)
    (fitres, coef_str, cluster_grouping_str,
     grouping_col_str) = __prepare_strings(clustervar, grouping_col)
    plpy.execute(
        """
        select {schema_madlib}.logregr_train('{tbl_data}',
            '{fitres}', '{depvar}', '{indvar}', {grouping_col},
            {max_iter}, '{optimizer}', {tolerance}, {verbose})
        """.format(schema_madlib=schema_madlib, tbl_data=tbl_data,
                   fitres=fitres, depvar=depvar, indvar=indvar,
                   grouping_col=grouping_col_str, max_iter=max_iter,
                   optimizer=optimizer, tolerance=tolerance,
                   verbose=verbose))
    old_msg_level = plpy.execute(
        "select setting from pg_settings where name='client_min_messages'"
    )[0]['setting']
    plpy.execute("set client_min_messages to error")
    plpy.execute(
        __generate_clustered_sql(schema_madlib=schema_madlib, depvar=depvar,
                                 indvar=indvar, coef_str=coef_str,
                                 tbl_data=tbl_data, fitres=fitres,
                                 cluster_grouping_str=cluster_grouping_str,
                                 tbl_output=tbl_output, regr_type="log"))
    plpy.execute(
        """
        drop table if exists {fitres}
        """.format(fitres=fitres))
    plpy.execute("set client_min_messages to " + old_msg_level)
    return None
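# Hypothetical usage (made-up table and column names, "madlib" schema assumed);
# the last four arguments are the logistic-regression controls that are passed
# straight through to logregr_train:
#
#   SELECT madlib.clustered_variance_logregr(
#       'patients', 'patients_clustered_out',
#       'second_attack', 'ARRAY[1, treatment, trait_anxiety]',
#       'hospital_id', NULL,
#       20, 'irls', 0.0001, FALSE
#   );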
# ========================================================================
def validate_args_clustered_variance_logregr(schema_madlib, tbl_data,
                                             tbl_output, depvar, indvar,
                                             clustervar, grouping_col,
                                             max_iter, optimizer, tolerance,
                                             verbose):
    """
    Validate the parameters
    """
    validate_args_clustered_variance(schema_madlib, tbl_data,
                                     tbl_output, depvar, indvar,
                                     clustervar, grouping_col)
    if max_iter is None or max_iter <= 0:
        plpy.error("Clustered variance estimation error: Maximum number of iterations must be a positive number!")
    if tolerance is None or tolerance < 0:
        plpy.error("Clustered variance estimation error: The tolerance must be a non-negative number!")
    if verbose not in (True, False):
        plpy.error("Clustered variance estimation error: verbose option must be a boolean!")
    if optimizer == "newton":
        optimizer = "irls"
    elif optimizer not in ("irls", "cg", "igd"):
        plpy.error("""Clustered variance estimation error: Unknown optimizer requested.
            Must be 'newton'/'irls', 'cg', or 'igd'.""")
# ========================================================================
def clustered_variance_logregr_help(schema_madlib, msg=None, **kwargs):
    """
    Print help messages
    """
    if msg is None or msg.strip(' ') in ('help', '?'):
        return """
        ----------------------------------------------------------------
        Summary
        ----------------------------------------------------------------
        Computes the clustered standard errors for logistic regression.
        The function first runs a logistic regression to get the fitting
        coefficients, and then computes the clustered standard errors.

        SELECT {schema_madlib}.clustered_variance_logregr(
            'tbl_data',
            'tbl_output',
            'depvar',
            'indvar',
            'clustervar',
            'grouping_col',
            max_iter,
            'optimizer',
            tolerance,
            verbose
        );

        Run:
        SELECT {schema_madlib}.clustered_variance_logregr('usage');
        to get more information.
        """.format(schema_madlib=schema_madlib)

    if msg.strip(' ') == 'usage':
        return """
        Usage:
        ----------------------------------------------------------------
        SELECT {schema_madlib}.clustered_variance_logregr(
            'tbl_data',     -- Name of the data table
            'tbl_output',   -- Name of the result table (raises an error if it already exists)
            'depvar',       -- Expression for the dependent variable
            'indvar',       -- Expression for the independent variables
            'clustervar',   -- Comma-separated cluster column names
            'grouping_col', -- Comma-separated grouping column names, default NULL
            max_iter,       -- Maximum number of iterations for the logistic regression, default 20
            'optimizer',    -- Optimization method for the logistic regression, default 'irls'
            tolerance,      -- Stop when the difference between the log-likelihoods of two
                            -- consecutive iterations falls below this value, default 0.0001
            verbose         -- Whether to print detailed information when computing the
                            -- logistic regression, default FALSE
        );

        Output:
        ----------------------------------------------------------------
        The output table has the following columns:
            coef     DOUBLE PRECISION[], -- Fitting coefficients
            std_err  DOUBLE PRECISION[], -- Clustered standard errors of the coefficients
            z_stats  DOUBLE PRECISION[], -- z-statistics of the coefficients
            p_values DOUBLE PRECISION[]  -- p-values of the coefficients
        """.format(schema_madlib=schema_madlib)
# ========================================================================
# Clustered errors for multi-logistic regression
# ========================================================================
# NOTE: Because of the current mlogregr interface, the "verbose" parameter
# currently has no effect.
def clustered_variance_mlogregr(schema_madlib, tbl_data, tbl_output,
                                depvar, indvar, clustervar, ref_category,
                                grouping_col, max_iter, optimizer, tolerance,
                                verbose, **kwargs):
    """
    Multi-logistic regression clustered standard errors
    """
    validate_args_clustered_variance_mlogregr(schema_madlib, tbl_data,
                                              tbl_output, depvar, indvar,
                                              clustervar, grouping_col,
                                              max_iter, optimizer, tolerance,
                                              verbose)
    (fitres, coef_str, cluster_grouping_str,
     grouping_col_str) = __prepare_strings(clustervar, grouping_col)
    old_msg_level = plpy.execute(
        "select setting from pg_settings where name='client_min_messages'"
    )[0]['setting']
    plpy.execute("set client_min_messages to error")
    # The multi-logistic regression has a different interface than the other
    # regressions, so we create a table to hold its fitting result.
    plpy.execute("""
        create table {fitres}(coef float8[]);
        """.format(fitres=fitres))
    plpy.execute(
        """
        insert into {fitres}
        (select (select coef from {schema_madlib}.mlogregr('{tbl_data}',
            '{depvar}', '{indvar}',
            {max_iter}, '{optimizer}', {tolerance}, {ref_category})))
        """.format(schema_madlib=schema_madlib, tbl_data=tbl_data,
                   fitres=fitres, depvar=depvar, indvar=indvar,
                   max_iter=max_iter, optimizer=optimizer,
                   tolerance=tolerance, ref_category=ref_category))
    plpy.execute(
        __generate_clustered_sql(schema_madlib=schema_madlib, depvar=depvar,
                                 indvar=indvar, coef_str=coef_str,
                                 tbl_data=tbl_data, fitres=fitres,
                                 cluster_grouping_str=cluster_grouping_str,
                                 tbl_output=tbl_output,
                                 ref_category=ref_category,
                                 regr_type="mlog"))
    plpy.execute(
        """
        drop table if exists {fitres}
        """.format(fitres=fitres))
    # Add the reference category as a column of the output table.
    plpy.execute(
        """
        ALTER table {tbl_output} add column ref_category INTEGER;
        """.format(tbl_output=tbl_output))
    plpy.execute(
        """
        update {tbl_output} set ref_category = {ref_category};
        """.format(tbl_output=tbl_output, ref_category=ref_category))
    plpy.execute("set client_min_messages to " + old_msg_level)
    return None
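# Hypothetical usage (made-up table and column names, "madlib" schema assumed);
# the integer passed as ref_category is also written into the ref_category
# column of the output table:
#
#   SELECT madlib.clustered_variance_mlogregr(
#       'test3', 'test3_clustered_out',
#       'y', 'ARRAY[1, x1, x2]',
#       'id_cluster', 0, NULL,
#       20, 'irls', 0.0001, FALSE
#   );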
# ========================================================================
def validate_args_clustered_variance_mlogregr(schema_madlib, tbl_data,
                                              tbl_output, depvar, indvar,
                                              clustervar, grouping_col,
                                              max_iter, optimizer, tolerance,
                                              verbose):
    """
    Validate the parameters
    """
    validate_args_clustered_variance(schema_madlib, tbl_data,
                                     tbl_output, depvar, indvar,
                                     clustervar, grouping_col)
    if max_iter <= 0:
        plpy.error("Clustered variance estimation error: Maximum number of iterations must be positive!")
    if tolerance < 0:
        plpy.error("Clustered variance estimation error: The tolerance cannot be negative!")
    if optimizer == "newton":
        optimizer = "irls"
    elif optimizer not in ("irls",):
        plpy.error("""Clustered variance estimation error: Unknown optimizer requested.
            Must be 'newton'/'irls'.""")
# ========================================================================
def clustered_variance_mlogregr_help(schema_madlib, msg=None, **kwargs):
    """
    Print help messages
    """
    if msg is None or msg.strip(' ') in ('help', '?'):
        return """
        ----------------------------------------------------------------
        Summary
        ----------------------------------------------------------------
        Computes the clustered standard errors for multinomial logistic
        regression. The function first runs a multinomial logistic
        regression to get the fitting coefficients, and then computes
        the clustered standard errors.

        SELECT {schema_madlib}.clustered_variance_mlogregr(
            'tbl_data',
            'tbl_output',
            'depvar',
            'indvar',
            'clustervar',
            'ref_category',
            'grouping_col',
            max_iter,
            'optimizer',
            tolerance,
            verbose
        );

        Run:
        SELECT {schema_madlib}.clustered_variance_mlogregr('usage');
        to get more information.
        """.format(schema_madlib=schema_madlib)

    if msg.strip(' ') == 'usage':
        return """
        Usage:
        ----------------------------------------------------------------
        SELECT {schema_madlib}.clustered_variance_mlogregr(
            'tbl_data',     -- Name of the data table
            'tbl_output',   -- Name of the result table (raises an error if it already exists)
            'depvar',       -- Expression for the dependent variable
            'indvar',       -- Expression for the independent variables
            'clustervar',   -- Comma-separated cluster column names
            'ref_category', -- Reference category for the multinomial logistic regression
            'grouping_col', -- Comma-separated grouping column names, default NULL
            max_iter,       -- Maximum number of iterations for the logistic regression, default 20
            'optimizer',    -- Optimization method for the logistic regression, default 'irls'
            tolerance,      -- Stop when the difference between the log-likelihoods of two
                            -- consecutive iterations falls below this value, default 0.0001
            verbose         -- Whether to print detailed information when computing the
                            -- logistic regression, default FALSE
        );

        Output:
        ----------------------------------------------------------------
        The output table has the following columns:
            ref_category INTEGER,            -- Reference category used in the regression
            coef         DOUBLE PRECISION[], -- Fitting coefficients
            std_err      DOUBLE PRECISION[], -- Clustered standard errors of the coefficients
            z_stats      DOUBLE PRECISION[], -- z-statistics of the coefficients
            p_values     DOUBLE PRECISION[]  -- p-values of the coefficients
        """.format(schema_madlib=schema_madlib)