src/ports/postgres/modules/utilities/data_preparation.py_in - madlib - Git at Google

 """
 @file dummy_coding.py_in

 @brief Marginal Effects with Interactions: Contains the main interface function
 and other functions that are common to the various methods and related to
 database constructs.

 @namespace marginal
 """
 import plpy
 from utilities import _assert
 from utilities import split_quoted_delimited_str
 from utilities import strip_end_quotes
 from validate_args import table_exists
 from validate_args import columns_exist_in_table
 from validate_args import table_is_empty
 from validate_args import _get_table_schema_names
 from validate_args import get_first_schema

 m4_changequote(`<!', `!>')


 def get_distribution_policy(schema_madlib, source_table):
     """ Return a list of columns that define the distribution policy of table
     Args:
         @param source_table

     Returns:
         List of str.
     """
     _, table_name = _get_table_schema_names(source_table)
     schema_name = get_first_schema(source_table)
     dist_attr = plpy.execute("""
     SELECT array_agg(pga.attname) as dist_attr
     FROM (
         SELECT gdp.localoid,
                  CASE
                      WHEN ( ARRAY_UPPER(gdp.attrnums, 1) > 0 ) THEN
                         UNNEST(gdp.attrnums)
                      ELSE NULL
                  END AS attnum
             FROM gp_distribution_policy gdp
         ) AS distkey
         INNER JOIN pg_class AS pgc
         ON distkey.localoid = pgc.oid AND pgc.relname = '{table_name}'
         INNER JOIN pg_namespace pgn
         ON pgc.relnamespace = pgn.oid AND pgn.nspname = '{schema_name}'
         LEFT OUTER JOIN pg_attribute pga
         ON distkey.attnum = pga.attnum AND distkey.localoid = pga.attrelid
     """.format(table_name=table_name, schema_name=schema_name))[0]["dist_attr"]
     return dist_attr
 #------------------------------------------------------------------------------


 def create_indicator_variables(schema_madlib, source_table, out_table,
                  categorical_cols, distributed_by=None,
                  keep_null=False, **kwargs):
     """
     Helper function that can be used to create dummy coding for
     categorical variables.
     Args:
         @param source_table The original data table
         @param out_table    The output table that contains the dummy
                             variable columns
         @param categorical_cols   A string, comma separated column names for
                             for categorical variables
     """
     cols = split_quoted_delimited_str(categorical_cols)
     validate_dummy_coding(source_table, out_table, cols)

     sql_list = ["CREATE TABLE " + out_table + " AS (SELECT *"]
     for col in cols:
         col_no_quotes = strip_end_quotes(col.strip())
         distinct_values = plpy.execute(
             "SELECT {col} AS value FROM {source_table} "
             "GROUP BY {col} ORDER BY {col}".
             format(col=col, source_table=source_table))
         distinct_values = [strip_end_quotes(item['value']) for item in distinct_values]
         null_wrap_case_str = ""
         null_wrap_end_str = ""
         if not keep_null and None in distinct_values:
             null_wrap_case_str = "CASE WHEN \"{col}\" is NULL THEN NULL ELSE "
             null_wrap_end_str = "END"
         case_str = ("({wrap_case} "
                     "CASE WHEN \"{{col}}\" = '{{value}}' THEN 1 ELSE 0 END "
                     "{wrap_end})".
                     format(wrap_case=null_wrap_case_str,
                            wrap_end=null_wrap_end_str))
         sql_list.append(
             ", " +
             ', '.join("{case_str} as \"{{col}}_{{value}}\"".
                       format(case_str=case_str).
                       format(col=col_no_quotes, value=str(value))
                       for value in distinct_values if value is not None))
         if keep_null and None in distinct_values:
             sql_list.append(", (CASE WHEN \"{0}\" IS NULL THEN 1 ELSE 0 END) "
                             "as \"{0}_NULL\"".format(col_no_quotes))
     sql_list.append(" FROM " + source_table + ") ")
     m4_ifdef(<!__POSTGRESQL__!>, <!!>, <!
     if distributed_by:
         dist_str = distributed_by
     else:
         dist_str = ','.join(['"%s"'%i for i in get_distribution_policy(schema_madlib, source_table)
                              if i is not None])
     if dist_str:
         sql_list.append("distributed by (" + dist_str + ")")
     else:
         sql_list.append("distributed randomly")
     !>)
     plpy.execute(''.join(sql_list))
     return None
 # ---------------------------------------------------------------


 def validate_dummy_coding(source_table, out_table, cols):
     """
     Args:
         @param source_table The original data table
         @param out_table    The output table that will contain dummy columns
         @param cols         An array of categorical column names
     """
     _assert(out_table and
             out_table.strip().lower() not in ('null', ''),
             "Invalid output table name!")
     _assert(not table_exists(out_table),
             "Output table already exists!")
     _assert(source_table and source_table.strip().lower() not in ('null', ''),
             "Invalid data table name!")
     _assert(table_exists(source_table),
             "Data table ({0}) is missing!". format(source_table))
     _assert(not table_is_empty(source_table),
             "Data table ({0}) is empty!". format(source_table))
     _assert(columns_exist_in_table(source_table, cols),
             "Not all columns from {0} present in source table ({1})"
             .format(cols, source_table))
 #------------------------------------------------------------------------------


 def indicator_variables_help(schema_madlib, message, **kwargs):
     """
     Help function for create_indicator_variables

     Args:
         @param schema_madlib
         @param message: string, Help message string
         @param kwargs

     Returns:
         String. Help/usage information
     """
     if not message:
         help_string = """
 -----------------------------------------------------------------------
                             SUMMARY
 -----------------------------------------------------------------------
 Provide functionality to create indicator variables from categorical variables
 to be used by regression methods. Categorical variables require special
 attention in regression analysis because, unlike dichotomous or continuous
 variables, they cannot by entered into the regression equation just as they are.
 For example, if you have a variable called race that is coded 1 = Hispanic, 2 =
 Asian 3 = Black 4 = White, then entering race in your regression will look at
 the linear effect of race, which is probably not what you intended. Instead,
 categorical variables like this need to be recoded into a series of indicator
 variables which can then be entered into the regression model.

 For more details on function usage:
     SELECT {schema_madlib}.create_indicator_variables('usage')
             """
     elif message in ['usage', 'help', '?']:
         help_string = """
 -----------------------------------------------------------------------
                             USAGE
 -----------------------------------------------------------------------
  SELECT {schema_madlib}.create_indicator_variables(
     source_table,       -- Name of source table containing data for categorical variables
     out_table,          -- Name of table to output data for indicator variables
                         --   (also includes other columns present in 'source_table')
     categorical_cols,   -- Comma-separated string of categorical variable column names
     keep_null,          -- (Default = False) Determines behavior for NULL value in categorical variable.
                         --   If True, then an indicator variable is created corresponding to the NULL value.
                         --   If False, then all indicator variables for that row will be set to NULL.
     distributed_by      -- (Default = NULL) Comma-separated column names to distribute output table.
                         --   If NULL then output is distributed using same policy as source_table.
                         --   Note: This parameter is not available for POSTGRESQL platform.
  );

 -----------------------------------------------------------------------
                             OUTPUT
 -----------------------------------------------------------------------
 The output table ('output_table' above) has all the columns present in source
 table, plus additional columns for each indicator variable corresponding to each
 value of a categorical variable. The column name for the indicator variable is
 set as '<categorical name>_<categorical value>'.
 """
     else:
         help_string = "No such option. Use {schema_madlib}.create_indicator_variables()"

     return help_string.format(schema_madlib=schema_madlib)
 # ---------------------------------------------------------------------
	"""
	@file dummy_coding.py_in

	@brief Marginal Effects with Interactions: Contains the main interface function
	and other functions that are common to the various methods and related to
	database constructs.

	@namespace marginal
	"""
	import plpy
	from utilities import _assert
	from utilities import split_quoted_delimited_str
	from utilities import strip_end_quotes
	from validate_args import table_exists
	from validate_args import columns_exist_in_table
	from validate_args import table_is_empty
	from validate_args import _get_table_schema_names
	from validate_args import get_first_schema

	m4_changequote(`<!', `!>')


	def get_distribution_policy(schema_madlib, source_table):
	""" Return a list of columns that define the distribution policy of table
	Args:
	@param source_table

	Returns:
	List of str.
	"""
	_, table_name = _get_table_schema_names(source_table)
	schema_name = get_first_schema(source_table)
	dist_attr = plpy.execute("""
	SELECT array_agg(pga.attname) as dist_attr
	FROM (
	SELECT gdp.localoid,
	CASE
	WHEN ( ARRAY_UPPER(gdp.attrnums, 1) > 0 ) THEN
	UNNEST(gdp.attrnums)
	ELSE NULL
	END AS attnum
	FROM gp_distribution_policy gdp
	) AS distkey
	INNER JOIN pg_class AS pgc
	ON distkey.localoid = pgc.oid AND pgc.relname = '{table_name}'
	INNER JOIN pg_namespace pgn
	ON pgc.relnamespace = pgn.oid AND pgn.nspname = '{schema_name}'
	LEFT OUTER JOIN pg_attribute pga
	ON distkey.attnum = pga.attnum AND distkey.localoid = pga.attrelid
	""".format(table_name=table_name, schema_name=schema_name))[0]["dist_attr"]
	return dist_attr
	#------------------------------------------------------------------------------


	def create_indicator_variables(schema_madlib, source_table, out_table,
	categorical_cols, distributed_by=None,
	keep_null=False, **kwargs):
	"""
	Helper function that can be used to create dummy coding for
	categorical variables.
	Args:
	@param source_table The original data table
	@param out_table The output table that contains the dummy
	variable columns
	@param categorical_cols A string, comma separated column names for
	for categorical variables
	"""
	cols = split_quoted_delimited_str(categorical_cols)
	validate_dummy_coding(source_table, out_table, cols)

	sql_list = ["CREATE TABLE " + out_table + " AS (SELECT *"]
	for col in cols:
	col_no_quotes = strip_end_quotes(col.strip())
	distinct_values = plpy.execute(
	"SELECT {col} AS value FROM {source_table} "
	"GROUP BY {col} ORDER BY {col}".
	format(col=col, source_table=source_table))
	distinct_values = [strip_end_quotes(item['value']) for item in distinct_values]
	null_wrap_case_str = ""
	null_wrap_end_str = ""
	if not keep_null and None in distinct_values:
	null_wrap_case_str = "CASE WHEN \"{col}\" is NULL THEN NULL ELSE "
	null_wrap_end_str = "END"
	case_str = ("({wrap_case} "
	"CASE WHEN \"{{col}}\" = '{{value}}' THEN 1 ELSE 0 END "
	"{wrap_end})".
	format(wrap_case=null_wrap_case_str,
	wrap_end=null_wrap_end_str))
	sql_list.append(
	", " +
	', '.join("{case_str} as \"{{col}}_{{value}}\"".
	format(case_str=case_str).
	format(col=col_no_quotes, value=str(value))
	for value in distinct_values if value is not None))
	if keep_null and None in distinct_values:
	sql_list.append(", (CASE WHEN \"{0}\" IS NULL THEN 1 ELSE 0 END) "
	"as \"{0}_NULL\"".format(col_no_quotes))
	sql_list.append(" FROM " + source_table + ") ")
	m4_ifdef(<!__POSTGRESQL__!>, <!!>, <!
	if distributed_by:
	dist_str = distributed_by
	else:
	dist_str = ','.join(['"%s"'%i for i in get_distribution_policy(schema_madlib, source_table)
	if i is not None])
	if dist_str:
	sql_list.append("distributed by (" + dist_str + ")")
	else:
	sql_list.append("distributed randomly")
	!>)
	plpy.execute(''.join(sql_list))
	return None
	# ---------------------------------------------------------------


	def validate_dummy_coding(source_table, out_table, cols):
	"""
	Args:
	@param source_table The original data table
	@param out_table The output table that will contain dummy columns
	@param cols An array of categorical column names
	"""
	_assert(out_table and
	out_table.strip().lower() not in ('null', ''),
	"Invalid output table name!")
	_assert(not table_exists(out_table),
	"Output table already exists!")
	_assert(source_table and source_table.strip().lower() not in ('null', ''),
	"Invalid data table name!")
	_assert(table_exists(source_table),
	"Data table ({0}) is missing!". format(source_table))
	_assert(not table_is_empty(source_table),
	"Data table ({0}) is empty!". format(source_table))
	_assert(columns_exist_in_table(source_table, cols),
	"Not all columns from {0} present in source table ({1})"
	.format(cols, source_table))
	#------------------------------------------------------------------------------


	def indicator_variables_help(schema_madlib, message, **kwargs):
	"""
	Help function for create_indicator_variables

	Args:
	@param schema_madlib
	@param message: string, Help message string
	@param kwargs

	Returns:
	String. Help/usage information
	"""
	if not message:
	help_string = """
	-----------------------------------------------------------------------
	SUMMARY
	-----------------------------------------------------------------------
	Provide functionality to create indicator variables from categorical variables
	to be used by regression methods. Categorical variables require special
	attention in regression analysis because, unlike dichotomous or continuous
	variables, they cannot by entered into the regression equation just as they are.
	For example, if you have a variable called race that is coded 1 = Hispanic, 2 =
	Asian 3 = Black 4 = White, then entering race in your regression will look at
	the linear effect of race, which is probably not what you intended. Instead,
	categorical variables like this need to be recoded into a series of indicator
	variables which can then be entered into the regression model.

	For more details on function usage:
	SELECT {schema_madlib}.create_indicator_variables('usage')
	"""
	elif message in ['usage', 'help', '?']:
	help_string = """
	-----------------------------------------------------------------------
	USAGE
	-----------------------------------------------------------------------
	SELECT {schema_madlib}.create_indicator_variables(
	source_table, -- Name of source table containing data for categorical variables
	out_table, -- Name of table to output data for indicator variables
	-- (also includes other columns present in 'source_table')
	categorical_cols, -- Comma-separated string of categorical variable column names
	keep_null, -- (Default = False) Determines behavior for NULL value in categorical variable.
	-- If True, then an indicator variable is created corresponding to the NULL value.
	-- If False, then all indicator variables for that row will be set to NULL.
	distributed_by -- (Default = NULL) Comma-separated column names to distribute output table.
	-- If NULL then output is distributed using same policy as source_table.
	-- Note: This parameter is not available for POSTGRESQL platform.
	);

	-----------------------------------------------------------------------
	OUTPUT
	-----------------------------------------------------------------------
	The output table ('output_table' above) has all the columns present in source
	table, plus additional columns for each indicator variable corresponding to each
	value of a categorical variable. The column name for the indicator variable is
	set as '<categorical name>_<categorical value>'.
	"""
	else:
	help_string = "No such option. Use {schema_madlib}.create_indicator_variables()"

	return help_string.format(schema_madlib=schema_madlib)
	# ---------------------------------------------------------------------