| # coding=utf-8 |
| # |
| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| |
| """ |
| @file create_indicators.py_in |
| |
| @brief Contains the main interface function |
| and other functions that are common to the various methods and related to |
| database constructs. |
| |
| @namespace create_indicators |
| """ |
| import plpy |
| from control import MinWarning |
| from utilities import _assert |
| from utilities import get_distribution_policy |
| from utilities import split_quoted_delimited_str |
| from utilities import strip_end_quotes |
| from validate_args import table_exists |
| from validate_args import columns_exist_in_table |
| from validate_args import table_is_empty |
| from validate_args import _get_table_schema_names |
| from validate_args import get_first_schema |
| |
| m4_changequote(`<!', `!>') |
| is_postgresql = m4_ifdef(<!__POSTGRESQL__!>, <!True!>, <!False!>) |
| # ----------------------------------------------------------------------- |
| # Deprecated functions corresponding to "create_indicators.sql_in" |
| # ----------------------------------------------------------------------- |
| |
| |
| def create_indicator_variables(schema_madlib, source_table, out_table, |
| categorical_cols, distributed_by=None, |
| keep_null=False, **kwargs): |
| """ |
| Helper function that can be used to create dummy coding for |
| categorical variables. |
| Args: |
| @param source_table The original data table |
| @param out_table The output table that contains the dummy |
| variable columns |
| @param categorical_cols A string, comma separated column names for |
| for categorical variables |
| """ |
| with MinWarning('warning'): |
| plpy.warning("This function has been deprecated. " |
| "Please use `encode_categorical_variables` for encoding categoricals.") |
| cols = split_quoted_delimited_str(categorical_cols) |
| validate_dummy_coding(source_table, out_table, cols) |
| |
| sql_list = ["CREATE TABLE " + out_table + " AS (SELECT *"] |
| for col in cols: |
| col_no_quotes = strip_end_quotes(col.strip()) |
| distinct_values = plpy.execute( |
| "SELECT {col} AS value FROM {source_table} " |
| "GROUP BY {col} ORDER BY {col}". |
| format(col=col, source_table=source_table)) |
| distinct_values = [strip_end_quotes(item['value']) for item in distinct_values] |
| null_wrap_case_str = "" |
| null_wrap_end_str = "" |
| if not keep_null and None in distinct_values: |
| null_wrap_case_str = "CASE WHEN \"{col}\" is NULL THEN NULL ELSE " |
| null_wrap_end_str = "END" |
| case_str = ("({wrap_case} " |
| "CASE WHEN \"{{col}}\" = '{{value}}' THEN 1 ELSE 0 END " |
| "{wrap_end})". |
| format(wrap_case=null_wrap_case_str, |
| wrap_end=null_wrap_end_str)) |
| sql_list.append( |
| ", " + |
| ', '.join("{case_str} as \"{{col}}_{{value}}\"". |
| format(case_str=case_str). |
| format(col=col_no_quotes, value=str(value)) |
| for value in distinct_values if value is not None)) |
| if keep_null and None in distinct_values: |
| sql_list.append(", (CASE WHEN \"{0}\" IS NULL THEN 1 ELSE 0 END) " |
| "as \"{0}_NULL\"".format(col_no_quotes)) |
| sql_list.append(" FROM " + source_table + ") ") |
| |
| if not is_postgresql: |
| if distributed_by: |
| dist_str = distributed_by |
| else: |
| dist_str = ','.join(['"%s"' % i |
| for i in get_distribution_policy(source_table) |
| if i is not None]) |
| if dist_str: |
| sql_list.append("distributed by (" + dist_str + ")") |
| else: |
| sql_list.append("distributed randomly") |
| plpy.execute(''.join(sql_list)) |
| return None |
| # --------------------------------------------------------------- |
| |
| |
| def validate_dummy_coding(source_table, out_table, cols): |
| """ |
| Args: |
| @param source_table The original data table |
| @param out_table The output table that will contain dummy columns |
| @param cols An array of categorical column names |
| """ |
| _assert(out_table and |
| out_table.strip().lower() not in ('null', ''), |
| "Invalid output table name!") |
| _assert(not table_exists(out_table), |
| "Output table already exists!") |
| _assert(source_table and source_table.strip().lower() not in ('null', ''), |
| "Invalid data table name!") |
| _assert(table_exists(source_table), |
| "Data table ({0}) is missing!". format(source_table)) |
| _assert(not table_is_empty(source_table), |
| "Data table ({0}) is empty!". format(source_table)) |
| _assert(columns_exist_in_table(source_table, cols), |
| "Not all columns from {0} present in source table ({1})" |
| .format(cols, source_table)) |
| # ------------------------------------------------------------------------------ |
| |
| |
| def indicator_variables_help(schema_madlib, message, **kwargs): |
| """ |
| Help function for create_indicator_variables |
| |
| Args: |
| @param schema_madlib |
| @param message: string, Help message string |
| @param kwargs |
| |
| Returns: |
| String. Help/usage information |
| """ |
| if not message: |
| help_string = """ |
| WARNING: This function has been deprecated in favor of the new |
| 'encode_categorical_variables' function. |
| |
| ----------------------------------------------------------------------- |
| SUMMARY |
| ----------------------------------------------------------------------- |
| Provide functionality to create indicator variables from categorical variables |
| to be used by regression methods. Categorical variables require special |
| attention in regression analysis because, unlike dichotomous or continuous |
| variables, they cannot by entered into the regression equation just as they are. |
| For example, if you have a variable called race that is coded 1 = Hispanic, 2 = |
| Asian 3 = Black 4 = White, then entering race in your regression will look at |
| the linear effect of race, which is probably not what you intended. Instead, |
| categorical variables like this need to be recoded into a series of indicator |
| variables which can then be entered into the regression model. |
| |
| For more details on function usage: |
| SELECT {schema_madlib}.create_indicator_variables('usage') |
| """ |
| elif message in ['usage', 'help', '?']: |
| help_string = """ |
| ----------------------------------------------------------------------- |
| USAGE |
| ----------------------------------------------------------------------- |
| SELECT {schema_madlib}.create_indicator_variables( |
| source_table, -- Name of source table containing data for categorical variables |
| out_table, -- Name of table to output data for indicator variables |
| -- (also includes other columns present in 'source_table') |
| categorical_cols, -- Comma-separated string of categorical variable column names |
| keep_null, -- (Default = False) Determines behavior for NULL value in categorical variable. |
| -- If True, then an indicator variable is created corresponding to the NULL value. |
| -- If False, then all indicator variables for that row will be set to NULL. |
| distributed_by -- (Default = NULL) Comma-separated column names to distribute output table. |
| -- If NULL then output is distributed using same policy as source_table. |
| -- Note: This parameter is not available for POSTGRESQL platform. |
| ); |
| |
| ----------------------------------------------------------------------- |
| OUTPUT |
| ----------------------------------------------------------------------- |
| The output table ('output_table' above) has all the columns present in source |
| table, plus additional columns for each indicator variable corresponding to each |
| value of a categorical variable. The column name for the indicator variable is |
| set as '<categorical name>_<categorical value>'. |
| """ |
| else: |
| help_string = "No such option. Use {schema_madlib}.create_indicator_variables()" |
| |
| return help_string.format(schema_madlib=schema_madlib) |
| # --------------------------------------------------------------------- |