blob: c232e6fc5bbaebc507f11637c88db7b1700dc6f5 [file] [log] [blame]
# coding=utf-8
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""
@file create_indicators.py_in
@brief Contains the main interface function
and other functions that are common to the various methods and related to
database constructs.
@namespace create_indicators
"""
import plpy
from control import MinWarning
from utilities import _assert
from utilities import get_distributed_by
from utilities import split_quoted_delimited_str
from utilities import strip_end_quotes
from validate_args import table_exists
from validate_args import columns_exist_in_table
from validate_args import table_is_empty
from validate_args import _get_table_schema_names
from validate_args import get_first_schema
m4_changequote(`<!', `!>')
is_postgresql = m4_ifdef(<!__POSTGRESQL__!>, <!True!>, <!False!>)
# -----------------------------------------------------------------------
# Deprecated functions corresponding to "create_indicators.sql_in"
# -----------------------------------------------------------------------
def create_indicator_variables(schema_madlib, source_table, out_table,
categorical_cols, distributed_by=None,
keep_null=False, **kwargs):
"""
Helper function that can be used to create dummy coding for
categorical variables.
Args:
@param source_table The original data table
@param out_table The output table that contains the dummy
variable columns
@param categorical_cols A string, comma separated column names for
for categorical variables
"""
with MinWarning('warning'):
plpy.warning("This function has been deprecated. "
"Please use `encode_categorical_variables` for encoding categoricals.")
cols = split_quoted_delimited_str(categorical_cols)
validate_dummy_coding(source_table, out_table, cols)
sql_list = ["CREATE TABLE " + out_table + " AS (SELECT *"]
for col in cols:
col_no_quotes = strip_end_quotes(col.strip())
distinct_values = plpy.execute(
"SELECT {col} AS value FROM {source_table} "
"GROUP BY {col} ORDER BY {col}".
format(col=col, source_table=source_table))
distinct_values = [strip_end_quotes(item['value']) for item in distinct_values]
null_wrap_case_str = ""
null_wrap_end_str = ""
if not keep_null and None in distinct_values:
null_wrap_case_str = "CASE WHEN \"{col}\" is NULL THEN NULL ELSE "
null_wrap_end_str = "END"
case_str = ("({wrap_case} "
"CASE WHEN \"{{col}}\" = '{{value}}' THEN 1 ELSE 0 END "
"{wrap_end})".
format(wrap_case=null_wrap_case_str,
wrap_end=null_wrap_end_str))
sql_list.append(
", " +
', '.join("{case_str} as \"{{col}}_{{value}}\"".
format(case_str=case_str).
format(col=col_no_quotes, value=str(value))
for value in distinct_values if value is not None))
if keep_null and None in distinct_values:
sql_list.append(", (CASE WHEN \"{0}\" IS NULL THEN 1 ELSE 0 END) "
"as \"{0}_NULL\"".format(col_no_quotes))
sql_list.append(" FROM " + source_table + ") ")
if not is_postgresql:
if distributed_by:
dist_str = 'distributed by (' + distributed_by + ')'
else:
dist_str = get_distributed_by(source_table)
sql_list.append(dist_str)
plpy.execute(''.join(sql_list))
return None
# ---------------------------------------------------------------
def validate_dummy_coding(source_table, out_table, cols):
"""
Args:
@param source_table The original data table
@param out_table The output table that will contain dummy columns
@param cols An array of categorical column names
"""
_assert(out_table and
out_table.strip().lower() not in ('null', ''),
"Invalid output table name!")
_assert(not table_exists(out_table),
"Output table already exists!")
_assert(source_table and source_table.strip().lower() not in ('null', ''),
"Invalid data table name!")
_assert(table_exists(source_table),
"Data table ({0}) is missing!". format(source_table))
_assert(not table_is_empty(source_table),
"Data table ({0}) is empty!". format(source_table))
_assert(columns_exist_in_table(source_table, cols),
"Not all columns from {0} present in source table ({1})"
.format(cols, source_table))
# ------------------------------------------------------------------------------
def indicator_variables_help(schema_madlib, message, **kwargs):
"""
Help function for create_indicator_variables
Args:
@param schema_madlib
@param message: string, Help message string
@param kwargs
Returns:
String. Help/usage information
"""
if not message:
help_string = """
WARNING: This function has been deprecated in favor of the new
'encode_categorical_variables' function.
-----------------------------------------------------------------------
SUMMARY
-----------------------------------------------------------------------
Provide functionality to create indicator variables from categorical variables
to be used by regression methods. Categorical variables require special
attention in regression analysis because, unlike dichotomous or continuous
variables, they cannot by entered into the regression equation just as they are.
For example, if you have a variable called race that is coded 1 = Hispanic, 2 =
Asian 3 = Black 4 = White, then entering race in your regression will look at
the linear effect of race, which is probably not what you intended. Instead,
categorical variables like this need to be recoded into a series of indicator
variables which can then be entered into the regression model.
For more details on function usage:
SELECT {schema_madlib}.create_indicator_variables('usage')
"""
elif message in ['usage', 'help', '?']:
help_string = """
-----------------------------------------------------------------------
USAGE
-----------------------------------------------------------------------
SELECT {schema_madlib}.create_indicator_variables(
source_table, -- Name of source table containing data for categorical variables
out_table, -- Name of table to output data for indicator variables
-- (also includes other columns present in 'source_table')
categorical_cols, -- Comma-separated string of categorical variable column names
keep_null, -- (Default = False) Determines behavior for NULL value in categorical variable.
-- If True, then an indicator variable is created corresponding to the NULL value.
-- If False, then all indicator variables for that row will be set to NULL.
distributed_by -- (Default = NULL) Comma-separated column names to distribute output table.
-- If NULL then output is distributed using same policy as source_table.
-- Note: This parameter is not available for POSTGRESQL platform.
);
-----------------------------------------------------------------------
OUTPUT
-----------------------------------------------------------------------
The output table ('output_table' above) has all the columns present in source
table, plus additional columns for each indicator variable corresponding to each
value of a categorical variable. The column name for the indicator variable is
set as '<categorical name>_<categorical value>'.
"""
else:
help_string = "No such option. Use {schema_madlib}.create_indicator_variables()"
return help_string.format(schema_madlib=schema_madlib)
# ---------------------------------------------------------------------