blob: b01b53497f36339741d0ba654bbef32d8411fd3b [file] [log] [blame]
"""
@file dummy_coding.py_in
@brief Marginal Effects with Interactions: Contains the main interface function
and other functions that are common to the various methods and related to
database constructs.
@namespace marginal
"""
import plpy
from utilities import _assert
from utilities import split_quoted_delimited_str
from utilities import strip_end_quotes
from validate_args import table_exists
from validate_args import columns_exist_in_table
from validate_args import table_is_empty
from validate_args import _get_table_schema_names
from validate_args import get_first_schema
m4_changequote(`<!', `!>')
def get_distribution_policy(schema_madlib, source_table):
""" Return a list of columns that define the distribution policy of table
Args:
@param source_table
Returns:
List of str.
"""
_, table_name = _get_table_schema_names(source_table)
schema_name = get_first_schema(source_table)
dist_attr = plpy.execute("""
SELECT array_agg(pga.attname) as dist_attr
FROM (
SELECT gdp.localoid,
CASE
WHEN ( ARRAY_UPPER(gdp.attrnums, 1) > 0 ) THEN
UNNEST(gdp.attrnums)
ELSE NULL
END AS attnum
FROM gp_distribution_policy gdp
) AS distkey
INNER JOIN pg_class AS pgc
ON distkey.localoid = pgc.oid AND pgc.relname = '{table_name}'
INNER JOIN pg_namespace pgn
ON pgc.relnamespace = pgn.oid AND pgn.nspname = '{schema_name}'
LEFT OUTER JOIN pg_attribute pga
ON distkey.attnum = pga.attnum AND distkey.localoid = pga.attrelid
""".format(table_name=table_name, schema_name=schema_name))[0]["dist_attr"]
return dist_attr
#------------------------------------------------------------------------------
def create_indicator_variables(schema_madlib, source_table, out_table,
categorical_cols, distributed_by=None,
keep_null=False, **kwargs):
"""
Helper function that can be used to create dummy coding for
categorical variables.
Args:
@param source_table The original data table
@param out_table The output table that contains the dummy
variable columns
@param categorical_cols A string, comma separated column names for
for categorical variables
"""
cols = split_quoted_delimited_str(categorical_cols)
validate_dummy_coding(source_table, out_table, cols)
sql_list = ["CREATE TABLE " + out_table + " AS (SELECT *"]
for col in cols:
col_no_quotes = strip_end_quotes(col.strip())
distinct_values = plpy.execute(
"SELECT {col} AS value FROM {source_table} "
"GROUP BY {col} ORDER BY {col}".
format(col=col, source_table=source_table))
distinct_values = [strip_end_quotes(item['value']) for item in distinct_values]
null_wrap_case_str = ""
null_wrap_end_str = ""
if not keep_null and None in distinct_values:
null_wrap_case_str = "CASE WHEN \"{col}\" is NULL THEN NULL ELSE "
null_wrap_end_str = "END"
case_str = ("({wrap_case} "
"CASE WHEN \"{{col}}\" = '{{value}}' THEN 1 ELSE 0 END "
"{wrap_end})".
format(wrap_case=null_wrap_case_str,
wrap_end=null_wrap_end_str))
sql_list.append(
", " +
', '.join("{case_str} as \"{{col}}_{{value}}\"".
format(case_str=case_str).
format(col=col_no_quotes, value=str(value))
for value in distinct_values if value is not None))
if keep_null and None in distinct_values:
sql_list.append(", (CASE WHEN \"{0}\" IS NULL THEN 1 ELSE 0 END) "
"as \"{0}_NULL\"".format(col_no_quotes))
sql_list.append(" FROM " + source_table + ") ")
m4_ifdef(<!__POSTGRESQL__!>, <!!>, <!
if distributed_by:
dist_str = distributed_by
else:
dist_str = ','.join(['"%s"'%i for i in get_distribution_policy(schema_madlib, source_table)
if i is not None])
if dist_str:
sql_list.append("distributed by (" + dist_str + ")")
else:
sql_list.append("distributed randomly")
!>)
plpy.execute(''.join(sql_list))
return None
# ---------------------------------------------------------------
def validate_dummy_coding(source_table, out_table, cols):
"""
Args:
@param source_table The original data table
@param out_table The output table that will contain dummy columns
@param cols An array of categorical column names
"""
_assert(out_table and
out_table.strip().lower() not in ('null', ''),
"Invalid output table name!")
_assert(not table_exists(out_table),
"Output table already exists!")
_assert(source_table and source_table.strip().lower() not in ('null', ''),
"Invalid data table name!")
_assert(table_exists(source_table),
"Data table ({0}) is missing!". format(source_table))
_assert(not table_is_empty(source_table),
"Data table ({0}) is empty!". format(source_table))
_assert(columns_exist_in_table(source_table, cols),
"Not all columns from {0} present in source table ({1})"
.format(cols, source_table))
#------------------------------------------------------------------------------
def indicator_variables_help(schema_madlib, message, **kwargs):
"""
Help function for create_indicator_variables
Args:
@param schema_madlib
@param message: string, Help message string
@param kwargs
Returns:
String. Help/usage information
"""
if not message:
help_string = """
-----------------------------------------------------------------------
SUMMARY
-----------------------------------------------------------------------
Provide functionality to create indicator variables from categorical variables
to be used by regression methods. Categorical variables require special
attention in regression analysis because, unlike dichotomous or continuous
variables, they cannot by entered into the regression equation just as they are.
For example, if you have a variable called race that is coded 1 = Hispanic, 2 =
Asian 3 = Black 4 = White, then entering race in your regression will look at
the linear effect of race, which is probably not what you intended. Instead,
categorical variables like this need to be recoded into a series of indicator
variables which can then be entered into the regression model.
For more details on function usage:
SELECT {schema_madlib}.create_indicator_variables('usage')
"""
elif message in ['usage', 'help', '?']:
help_string = """
-----------------------------------------------------------------------
USAGE
-----------------------------------------------------------------------
SELECT {schema_madlib}.create_indicator_variables(
source_table, -- Name of source table containing data for categorical variables
out_table, -- Name of table to output data for indicator variables
-- (also includes other columns present in 'source_table')
categorical_cols, -- Comma-separated string of categorical variable column names
keep_null, -- (Default = False) Determines behavior for NULL value in categorical variable.
-- If True, then an indicator variable is created corresponding to the NULL value.
-- If False, then all indicator variables for that row will be set to NULL.
distributed_by -- (Default = NULL) Comma-separated column names to distribute output table.
-- If NULL then output is distributed using same policy as source_table.
-- Note: This parameter is not available for POSTGRESQL platform.
);
-----------------------------------------------------------------------
OUTPUT
-----------------------------------------------------------------------
The output table ('output_table' above) has all the columns present in source
table, plus additional columns for each indicator variable corresponding to each
value of a categorical variable. The column name for the indicator variable is
set as '<categorical name>_<categorical value>'.
"""
else:
help_string = "No such option. Use {schema_madlib}.create_indicator_variables()"
return help_string.format(schema_madlib=schema_madlib)
# ---------------------------------------------------------------------