src/ports/postgres/modules/utilities/create_indicators.py_in - madlib - Git at Google

 # coding=utf-8
 #
 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.

 """
 @file create_indicators.py_in

 @brief Contains the main interface function
 and other functions that are common to the various methods and related to
 database constructs.

 @namespace create_indicators
 """
 import plpy
 from control import MinWarning
 from utilities import _assert
 from utilities import get_distributed_by
 from utilities import split_quoted_delimited_str
 from utilities import strip_end_quotes
 from validate_args import table_exists
 from validate_args import columns_exist_in_table
 from validate_args import table_is_empty
 from validate_args import _get_table_schema_names
 from validate_args import get_first_schema

 m4_changequote(`<!', `!>')
 is_postgresql = m4_ifdef(<!__POSTGRESQL__!>, <!True!>, <!False!>)
 # -----------------------------------------------------------------------
 # Deprecated functions corresponding to "create_indicators.sql_in"
 # -----------------------------------------------------------------------


 def create_indicator_variables(schema_madlib, source_table, out_table,
                                categorical_cols, distributed_by=None,
                                keep_null=False, **kwargs):
     """
     Helper function that can be used to create dummy coding for
     categorical variables.
     Args:
         @param source_table The original data table
         @param out_table    The output table that contains the dummy
                             variable columns
         @param categorical_cols   A string, comma separated column names for
                             for categorical variables
     """
     with MinWarning('warning'):
         plpy.warning("This function has been deprecated. "
                      "Please use `encode_categorical_variables` for encoding categoricals.")
         cols = split_quoted_delimited_str(categorical_cols)
         validate_dummy_coding(source_table, out_table, cols)

         sql_list = ["CREATE TABLE " + out_table + " AS (SELECT *"]
         for col in cols:
             col_no_quotes = strip_end_quotes(col.strip())
             distinct_values = plpy.execute(
                 "SELECT {col} AS value FROM {source_table} "
                 "GROUP BY {col} ORDER BY {col}".
                 format(col=col, source_table=source_table))
             distinct_values = [strip_end_quotes(item['value']) for item in distinct_values]
             null_wrap_case_str = ""
             null_wrap_end_str = ""
             if not keep_null and None in distinct_values:
                 null_wrap_case_str = "CASE WHEN \"{col}\" is NULL THEN NULL ELSE "
                 null_wrap_end_str = "END"
             case_str = ("({wrap_case} "
                         "CASE WHEN \"{{col}}\" = '{{value}}' THEN 1 ELSE 0 END "
                         "{wrap_end})".
                         format(wrap_case=null_wrap_case_str,
                                wrap_end=null_wrap_end_str))
             sql_list.append(
                 ", " +
                 ', '.join("{case_str} as \"{{col}}_{{value}}\"".
                           format(case_str=case_str).
                           format(col=col_no_quotes, value=str(value))
                           for value in distinct_values if value is not None))
             if keep_null and None in distinct_values:
                 sql_list.append(", (CASE WHEN \"{0}\" IS NULL THEN 1 ELSE 0 END) "
                                 "as \"{0}_NULL\"".format(col_no_quotes))
         sql_list.append(" FROM " + source_table + ") ")

         if not is_postgresql:
             if distributed_by:
                 dist_str = 'distributed by (' + distributed_by + ')'
             else:
                 dist_str = get_distributed_by(source_table)
 		sql_list.append(dist_str)
         plpy.execute(''.join(sql_list))
         return None
 # ---------------------------------------------------------------


 def validate_dummy_coding(source_table, out_table, cols):
     """
     Args:
         @param source_table The original data table
         @param out_table    The output table that will contain dummy columns
         @param cols         An array of categorical column names
     """
     _assert(out_table and
             out_table.strip().lower() not in ('null', ''),
             "Invalid output table name!")
     _assert(not table_exists(out_table),
             "Output table already exists!")
     _assert(source_table and source_table.strip().lower() not in ('null', ''),
             "Invalid data table name!")
     _assert(table_exists(source_table),
             "Data table ({0}) is missing!". format(source_table))
     _assert(not table_is_empty(source_table),
             "Data table ({0}) is empty!". format(source_table))
     _assert(columns_exist_in_table(source_table, cols),
             "Not all columns from {0} present in source table ({1})"
             .format(cols, source_table))
 # ------------------------------------------------------------------------------


 def indicator_variables_help(schema_madlib, message, **kwargs):
     """
     Help function for create_indicator_variables

     Args:
         @param schema_madlib
         @param message: string, Help message string
         @param kwargs

     Returns:
         String. Help/usage information
     """
     if not message:
         help_string = """
 WARNING: This function has been deprecated in favor of the new
 'encode_categorical_variables' function.

 -----------------------------------------------------------------------
                             SUMMARY
 -----------------------------------------------------------------------
 Provide functionality to create indicator variables from categorical variables
 to be used by regression methods. Categorical variables require special
 attention in regression analysis because, unlike dichotomous or continuous
 variables, they cannot by entered into the regression equation just as they are.
 For example, if you have a variable called race that is coded 1 = Hispanic, 2 =
 Asian 3 = Black 4 = White, then entering race in your regression will look at
 the linear effect of race, which is probably not what you intended. Instead,
 categorical variables like this need to be recoded into a series of indicator
 variables which can then be entered into the regression model.

 For more details on function usage:
     SELECT {schema_madlib}.create_indicator_variables('usage')
             """
     elif message in ['usage', 'help', '?']:
         help_string = """
 -----------------------------------------------------------------------
                             USAGE
 -----------------------------------------------------------------------
  SELECT {schema_madlib}.create_indicator_variables(
     source_table,       -- Name of source table containing data for categorical variables
     out_table,          -- Name of table to output data for indicator variables
                         --   (also includes other columns present in 'source_table')
     categorical_cols,   -- Comma-separated string of categorical variable column names
     keep_null,          -- (Default = False) Determines behavior for NULL value in categorical variable.
                         --   If True, then an indicator variable is created corresponding to the NULL value.
                         --   If False, then all indicator variables for that row will be set to NULL.
     distributed_by      -- (Default = NULL) Comma-separated column names to distribute output table.
                         --   If NULL then output is distributed using same policy as source_table.
                         --   Note: This parameter is not available for POSTGRESQL platform.
  );

 -----------------------------------------------------------------------
                             OUTPUT
 -----------------------------------------------------------------------
 The output table ('output_table' above) has all the columns present in source
 table, plus additional columns for each indicator variable corresponding to each
 value of a categorical variable. The column name for the indicator variable is
 set as '<categorical name>_<categorical value>'.
 """
     else:
         help_string = "No such option. Use {schema_madlib}.create_indicator_variables()"

     return help_string.format(schema_madlib=schema_madlib)
 # ---------------------------------------------------------------------
	# coding=utf-8
	#
	# Licensed to the Apache Software Foundation (ASF) under one
	# or more contributor license agreements. See the NOTICE file
	# distributed with this work for additional information
	# regarding copyright ownership. The ASF licenses this file
	# to you under the Apache License, Version 2.0 (the
	# "License"); you may not use this file except in compliance
	# with the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing,
	# software distributed under the License is distributed on an
	# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	# KIND, either express or implied. See the License for the
	# specific language governing permissions and limitations
	# under the License.

	"""
	@file create_indicators.py_in

	@brief Contains the main interface function
	and other functions that are common to the various methods and related to
	database constructs.

	@namespace create_indicators
	"""
	import plpy
	from control import MinWarning
	from utilities import _assert
	from utilities import get_distributed_by
	from utilities import split_quoted_delimited_str
	from utilities import strip_end_quotes
	from validate_args import table_exists
	from validate_args import columns_exist_in_table
	from validate_args import table_is_empty
	from validate_args import _get_table_schema_names
	from validate_args import get_first_schema

	m4_changequote(`<!', `!>')
	is_postgresql = m4_ifdef(<!__POSTGRESQL__!>, <!True!>, <!False!>)
	# -----------------------------------------------------------------------
	# Deprecated functions corresponding to "create_indicators.sql_in"
	# -----------------------------------------------------------------------


	def create_indicator_variables(schema_madlib, source_table, out_table,
	categorical_cols, distributed_by=None,
	keep_null=False, **kwargs):
	"""
	Helper function that can be used to create dummy coding for
	categorical variables.
	Args:
	@param source_table The original data table
	@param out_table The output table that contains the dummy
	variable columns
	@param categorical_cols A string, comma separated column names for
	for categorical variables
	"""
	with MinWarning('warning'):
	plpy.warning("This function has been deprecated. "
	"Please use `encode_categorical_variables` for encoding categoricals.")
	cols = split_quoted_delimited_str(categorical_cols)
	validate_dummy_coding(source_table, out_table, cols)

	sql_list = ["CREATE TABLE " + out_table + " AS (SELECT *"]
	for col in cols:
	col_no_quotes = strip_end_quotes(col.strip())
	distinct_values = plpy.execute(
	"SELECT {col} AS value FROM {source_table} "
	"GROUP BY {col} ORDER BY {col}".
	format(col=col, source_table=source_table))
	distinct_values = [strip_end_quotes(item['value']) for item in distinct_values]
	null_wrap_case_str = ""
	null_wrap_end_str = ""
	if not keep_null and None in distinct_values:
	null_wrap_case_str = "CASE WHEN \"{col}\" is NULL THEN NULL ELSE "
	null_wrap_end_str = "END"
	case_str = ("({wrap_case} "
	"CASE WHEN \"{{col}}\" = '{{value}}' THEN 1 ELSE 0 END "
	"{wrap_end})".
	format(wrap_case=null_wrap_case_str,
	wrap_end=null_wrap_end_str))
	sql_list.append(
	", " +
	', '.join("{case_str} as \"{{col}}_{{value}}\"".
	format(case_str=case_str).
	format(col=col_no_quotes, value=str(value))
	for value in distinct_values if value is not None))
	if keep_null and None in distinct_values:
	sql_list.append(", (CASE WHEN \"{0}\" IS NULL THEN 1 ELSE 0 END) "
	"as \"{0}_NULL\"".format(col_no_quotes))
	sql_list.append(" FROM " + source_table + ") ")

	if not is_postgresql:
	if distributed_by:
	dist_str = 'distributed by (' + distributed_by + ')'
	else:
	dist_str = get_distributed_by(source_table)
	sql_list.append(dist_str)
	plpy.execute(''.join(sql_list))
	return None
	# ---------------------------------------------------------------


	def validate_dummy_coding(source_table, out_table, cols):
	"""
	Args:
	@param source_table The original data table
	@param out_table The output table that will contain dummy columns
	@param cols An array of categorical column names
	"""
	_assert(out_table and
	out_table.strip().lower() not in ('null', ''),
	"Invalid output table name!")
	_assert(not table_exists(out_table),
	"Output table already exists!")
	_assert(source_table and source_table.strip().lower() not in ('null', ''),
	"Invalid data table name!")
	_assert(table_exists(source_table),
	"Data table ({0}) is missing!". format(source_table))
	_assert(not table_is_empty(source_table),
	"Data table ({0}) is empty!". format(source_table))
	_assert(columns_exist_in_table(source_table, cols),
	"Not all columns from {0} present in source table ({1})"
	.format(cols, source_table))
	# ------------------------------------------------------------------------------


	def indicator_variables_help(schema_madlib, message, **kwargs):
	"""
	Help function for create_indicator_variables

	Args:
	@param schema_madlib
	@param message: string, Help message string
	@param kwargs

	Returns:
	String. Help/usage information
	"""
	if not message:
	help_string = """
	WARNING: This function has been deprecated in favor of the new
	'encode_categorical_variables' function.

	-----------------------------------------------------------------------
	SUMMARY
	-----------------------------------------------------------------------
	Provide functionality to create indicator variables from categorical variables
	to be used by regression methods. Categorical variables require special
	attention in regression analysis because, unlike dichotomous or continuous
	variables, they cannot by entered into the regression equation just as they are.
	For example, if you have a variable called race that is coded 1 = Hispanic, 2 =
	Asian 3 = Black 4 = White, then entering race in your regression will look at
	the linear effect of race, which is probably not what you intended. Instead,
	categorical variables like this need to be recoded into a series of indicator
	variables which can then be entered into the regression model.

	For more details on function usage:
	SELECT {schema_madlib}.create_indicator_variables('usage')
	"""
	elif message in ['usage', 'help', '?']:
	help_string = """
	-----------------------------------------------------------------------
	USAGE
	-----------------------------------------------------------------------
	SELECT {schema_madlib}.create_indicator_variables(
	source_table, -- Name of source table containing data for categorical variables
	out_table, -- Name of table to output data for indicator variables
	-- (also includes other columns present in 'source_table')
	categorical_cols, -- Comma-separated string of categorical variable column names
	keep_null, -- (Default = False) Determines behavior for NULL value in categorical variable.
	-- If True, then an indicator variable is created corresponding to the NULL value.
	-- If False, then all indicator variables for that row will be set to NULL.
	distributed_by -- (Default = NULL) Comma-separated column names to distribute output table.
	-- If NULL then output is distributed using same policy as source_table.
	-- Note: This parameter is not available for POSTGRESQL platform.
	);

	-----------------------------------------------------------------------
	OUTPUT
	-----------------------------------------------------------------------
	The output table ('output_table' above) has all the columns present in source
	table, plus additional columns for each indicator variable corresponding to each
	value of a categorical variable. The column name for the indicator variable is
	set as '<categorical name>_<categorical value>'.
	"""
	else:
	help_string = "No such option. Use {schema_madlib}.create_indicator_variables()"

	return help_string.format(schema_madlib=schema_madlib)
	# ---------------------------------------------------------------------