src/ports/postgres/modules/utilities/encode_categorical.py_in - madlib - Git at Google

 # coding=utf-8
 #
 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.

 # Please refer to the encode_categorical.sql_in file for the documentation

 """
 @file encode_categorical.py_in

 """
 from itertools import count
 from bisect import bisect

 import plpy
 from control import MinWarning

 from utilities import _assert
 from utilities import strip_end_quotes
 from utilities import split_quoted_delimited_str
 from utilities import extract_keyvalue_params
 from utilities import add_postfix
 from utilities import is_platform_pg

 from validate_args import input_tbl_valid, output_tbl_valid, is_var_valid
 from validate_args import unquote_ident, quote_ident
 from validate_args import get_expr_type, get_cols_and_types

 import math

 # If there are more than 1600 columns for the output table,
 # it might lead to a database error.
 MAX_OUTPUT_COLUMN_COUNT = 1600

 # If a column name has more than 63 characters it gets trimmed automatically,
 # which may cause an exception. Enable the output dictionary in this case.
 MAX_COLUMN_LENGTH = 63


 class CategoricalEncoder(object):
     """Encoding class to encode categorical variables"""
     def __init__(self,
                  schema_madlib, source_table, output_table, categorical_cols,
                  categorical_cols_to_exclude=None,
                  row_id=None,
                  top=None,
                  value_to_drop=None,
                  encode_null=False,
                  output_type=None,
                  output_dictionary=False,
                  distributed_by=None,
                  **kwargs):
         super(CategoricalEncoder, self).__init__()
         self.schema_madlib = schema_madlib
         self.source_table = source_table
         self.output_table = output_table
         self.categorical_cols = categorical_cols
         self.categorical_cols_to_exclude = categorical_cols_to_exclude
         self.row_id = row_id
         self.top = top
         self.value_to_drop = value_to_drop
         self.encode_null = encode_null

         self.output_type = 'column' if not output_type else output_type.lower()
         self.output_dictionary = output_dictionary
         self.distributed_by = distributed_by if not is_platform_pg() else None

         self._name_others_col = "_MISC__"
         self._array_out_name = "__encoded_variables__"

         # create new parameters after validating and parsing inputs
         # (order of below statements is relevant)
         self._parse_parameters()
         self._validate_parameters()

         self._output_cols, self._col_to_type = self._get_cols_to_encode()
         if not self._output_cols:
             plpy.error("Encoding categorical: No categorical columns available "
                        "to encode (or all have been excluded)")

         # _parse_dictlike_parameters uses _output_cols and has to be below
         # _get_cols_to_encode
         self._parse_dictlike_parameters()
     # ----------------------------------------------------------------------

     def build_output_table(self):
         if self._top:
             distinct_values = self._get_top_values()
         else:
             distinct_values = self._get_distinct_values()
         categorical_col_str = self._build_encoding_str(distinct_values)
         if self._output_dictionary:
             self._build_output_dictionary(distinct_values)

         if self._distributed_by:
             if self._distributed_by[0].lower() == 'randomly':
                 distribution_str = 'DISTRIBUTED RANDOMLY'
             else:
                 distribution_str = 'DISTRIBUTED BY ({0})'.format(', '.join(self._distributed_by))
         else:
             distribution_str = ''

         if self._row_id_cols:
             other_cols = ', '.join(self._row_id_cols)
         else:
             all_cols = [k for k, v in self._all_cols_types]
             other_cols = ', '.join([c for c in all_cols
                                     if (c not in self._output_cols and
                                         unquote_ident(c) not in self._output_cols)])

         if self.output_type == 'array':
             categorical_col_str = ("ARRAY[{0}] AS {1}".
                                    format(categorical_col_str,
                                           self._array_out_name))
         elif self.output_type == 'svec':
             categorical_col_str = ("ARRAY[{0}]::float8[]::{1}.svec AS {2}".
                                    format(categorical_col_str,
                                           self.schema_madlib,
                                           self._array_out_name))
         out_sql = """
             CREATE TABLE {out} AS (
                 SELECT
                     {other_cols},
                     {cols}
                 FROM
                     {src}
                 )
             {dist}
             """.format(out=self.output_table,
                        other_cols=other_cols,
                        cols=categorical_col_str,
                        src=self.source_table,
                        dist=distribution_str)
         plpy.execute(out_sql)
     # -------------------------------------------------------------------------

     def _parse_parameters(self):
         # columns to encode
         if not self.categorical_cols or self.categorical_cols.strip() == '*':
             self._categorical_cols = []
         else:
             self._categorical_cols = split_quoted_delimited_str(self.categorical_cols)
         self._categorical_cols_to_exclude = split_quoted_delimited_str(self.categorical_cols_to_exclude)

         # columns that determine the index for output table
         self._row_id_cols = split_quoted_delimited_str(self.row_id)

         # output type for specific supported types
         all_output_types = sorted(['array', 'column', 'svec'])
         try:
             # allow user to specify a prefix substring of
             # supported output types. This works because the supported
             # output types have unique prefixes.
             self.output_type = next(s for s in all_output_types
                                     if s.startswith(self.output_type))
         except StopIteration:
             # next() returns a StopIteration if no element found
             plpy.error("Encoding categorical: Output type should be one of {0}".
                        format(','.join(all_output_types)))

         # flag to build a dictionary table
         self._output_dictionary = (True if self.output_type in ('array', 'svec')
                                    else self.output_dictionary)

         # how to distribute the output table (for distributed platforms)
         if not is_platform_pg():
             if self.distributed_by:
                 self._distributed_by = split_quoted_delimited_str(self.distributed_by.strip())
             else:
                 self._distributed_by = self._row_id_cols if self._row_id_cols else []
         else:
             self._distributed_by = []
     # -------------------------------------------------------------------------

     def _parse_dictlike_parameters(self):
         def _cast_validate_top(val_str):
             # each value of top can be either a float in (0.0, 1.0) or an integer
             try:
                 val = float(val_str)
                 _assert(0 < val,
                         "Encoding categorical error: top value should be positive")
                 if val >= 1:
                     # if val >= 1 then it should be input as a valid integer
                     # (e.g. 2 is valid but 2.0 is not)
                     _assert(val_str.isdigit(),
                             "Encoding categorical error: top value should "
                             "be an integer if greater than or equal to 1")
                     val = int(val_str)
             except ValueError:
                 plpy.error("Encoding categorical error: top value ({0})"
                            " is an invalid numeric value".format(val_str))
             return val

         if self.top:
             # top can be a mapping between col name and a value or ...
             out_param = extract_keyvalue_params(self.top,
                                                 allow_duplicates=False,
                                                 lower_case_names=False)
             if not out_param:
                 # ... a global value (without =) that applies to all columns
                 val = _cast_validate_top(self.top)
                 out_param = dict([(i, val) for i in self._output_cols])
             else:
                 for k in out_param:
                     out_param[k] = _cast_validate_top(out_param[k])
             self._top = out_param
         else:
             self._top = {}
         if self.value_to_drop:
             # value_to_drop can be a mapping between col name and a value or ...
             out_param = extract_keyvalue_params(self.value_to_drop, allow_duplicates=False)
             if not out_param:
                 # ... a global value (without =) that applies to all columns
                 out_param = dict([(i, str(self.value_to_drop)) for i in self._output_cols])
             self._value_to_drop = out_param
         else:
             self._value_to_drop = {}
     # -------------------------------------------------------------------------

     def _validate_parameters(self):
         input_tbl_valid(self.source_table, "Encoding categorical")
         output_tbl_valid(self.output_table, "Encoding categorical")
         output_tbl_valid(add_postfix(self.output_table, "_dictionary"), "Encoding categorical")
         if self._categorical_cols:
             _assert(is_var_valid(self.source_table, ','.join(self._categorical_cols)),
                     "Encoding categorical: Not all columns from ({0}) present in source table ({1})"
                     .format(self._categorical_cols, self.source_table))
         if self._row_id_cols:
             _assert(is_var_valid(self.source_table, ','.join(self._row_id_cols)),
                     "Encoding categorical: Not all columns from ({0}) present in source table ({1})"
                     .format(self._row_id_cols, self.source_table))
     # ------------------------------------------------------------------------------

     def _is_col_name_long(self, col_to_values):
         col_len = []
         for col, values in col_to_values.items():
             # Max col name length calculation:
             #       the name of column (col) +
             #       name of longest value in column (item) +
             #       underscore (1)
             values_len = []
             for v in values:
                 if v:
                     if not isinstance(v, (list, tuple)):
                         values_len.append(len(str(v)))
                     else:
                         values_len.append(len(self._name_others_col))
             col_len.append(len(col) + max(values_len) + 1)
         return max(col_len) > MAX_COLUMN_LENGTH
     # -------------------------------------------------------------------------

     def _get_quoted_unquoted(self, col_dict, col):
         """Special get function to check for quoted and unquoted col names
            in the given dictionary,
            since user input is not guaranteed to follow quote_ident rules

            It is assumed that 'col' is originally quoted - the end quotes
            are stripped to obtain the unquoted form.
         """
         return col_dict.get(col,
                             col_dict.get(
                                 quote_ident(col),
                                 col_dict.get(
                                     unquote_ident(col),
                                     None)))
     # -------------------------------------------------------------------------

     def _build_encoding_str(self, col_to_values):
         """ Build string to create categorical columns

         Returns:
            str. The string that goes into the select clause of a query to obtain
                 categorical column encodings
         """
         def _build_case_stmt(col, v, seq):
             """ Return a CASE statement that compares 'col' with the value v

             If v is a list then col is compared to be any one of the
             elements in v (with special handling for NULL value).
             """
             col_no_quotes = strip_end_quotes(col.strip())
             if isinstance(v, (list, tuple)):
                 # all values collected in a list are to be treated as a single
                 # categorical factor
                 if v:
                     non_null_v_str = ','.join(["'%s'" % (i) for i in v if i is not None])
                     if non_null_v_str:
                         value_str = "IN ({0})".format(non_null_v_str)
                         if None in v:
                             value_str += "OR {0} IS NULL".format(col)
                     else:
                         value_str = "IS NULL"
                 else:
                     return ''
                 v_type = list
             elif v is None:
                 value_str = "IS NULL"
                 v_type = None
             else:
                 # assume v is a string if not list/tuple and not None
                 value_str = "= '{v}'".format(v=str(v))
                 v_type = str

             if self.output_type not in ('array', 'svec'):
                 if not self._output_dictionary:
                     value_names = {None: 'NULL',
                                    list: self._name_others_col,
                                    str: strip_end_quotes(v)}
                     alias = 'AS "{0}_{1}"'.format(col_no_quotes, value_names[v_type])
                 else:
                     alias = 'AS "{0}_{1}"'.format(col_no_quotes, seq)
             else:
                 # if output_type is array-like then each case does not
                 # require an alias
                 alias = ""
             return ("(CASE WHEN ({col} {value_str}) "
                     "THEN 1 ELSE 0 END)::INTEGER {alias}".
                     format(col=col, value_str=value_str, alias=alias))

         self._output_dictionary = (self._output_dictionary or
                                    self._is_col_name_long(col_to_values))
         col_switch_list = []
         for col in self._output_cols:
             value_switch_list = [_build_case_stmt(col, v, i + 1)
                                  for i, v in enumerate(col_to_values[col])]
             # value_switch_list could have '' strings or empty lists which
             # need to be filtered out before adding to the case switch list
             col_switch_list.append(','.join([i for i in value_switch_list if i]))
         return ',\n'.join(col_switch_list)

     # ----------------------------------------------------------------------

     def _build_output_dictionary(self, col_to_values):
         """ Create a mapping between column names in output table and their
         corresponding meaning"""

         def _get_value_name(v):
             if v is None:
                 return "NULL"
             elif isinstance(v, (list, tuple)):
                 return ",".join([_get_value_name(i) for i in v])
             else:
                 return str(v)

         dict_tbl_name = add_postfix(self.output_table, "_dictionary")
         plpy.execute("""
             CREATE TABLE {tbl} (
                 encoded_column_name TEXT,
                 index               INTEGER,
                 variable            TEXT,
                 value               TEXT
             )
         """.format(tbl=dict_tbl_name))
         global_seq = count(1)
         for col in self._output_cols:
             values = col_to_values[col]
             local_seq = count(1)
             col_no_quotes = strip_end_quotes(col)
             if self.output_type != 'column':
                 encoded_col_name = "__encoded_variables__"
                 seq = global_seq
             else:
                 encoded_col_name = '"{col_no_quotes}_{seq}"'
                 seq = local_seq
             insert_template = "('%s', {seq}, '{col}', '{value_str}'::TEXT)" % (encoded_col_name)
             insert_values = [insert_template.
                              format(col=col,
                                     col_no_quotes=col_no_quotes,
                                     seq=next(seq),
                                     value_str=_get_value_name(v))
                              for v in values]
             plpy.execute("""INSERT INTO {tbl} VALUES {insert_str} """.
                          format(tbl=dict_tbl_name,
                                 insert_str=',\n'.join(insert_values)))
     # ----------------------------------------------------------------------

     def _find_cat_features(self):
         self._all_cols_types = get_cols_and_types(self.source_table)

         # any column belonging to the following types are considered categorical
         int_types = ['integer', 'smallint']
         text_types = ['text', 'varchar', 'character varying', 'char', 'character']
         boolean_types = ['boolean']
         self._cat_types = set(int_types + text_types + boolean_types)

         self._cat_features = [c for (c, t) in self._all_cols_types
                               if t in self._cat_types]
     # -------------------------------------------------------------------------

     def _get_cols_to_encode(self):
         """ Expand '*' syntax and exclude some categorical columns

         We also exclude from row_id columns
         """
         self._find_cat_features()

         # include the quoted name and the unquoted name in exclusion set
         # to allow user to provide either form
         exclude_set = set(self._categorical_cols_to_exclude + self._row_id_cols)
         exclude_set |= set(quote_ident(i)
                            for i in self._categorical_cols_to_exclude + self._row_id_cols)

         if not self._categorical_cols:
             features = [f for f in self._cat_features if f not in exclude_set]
             col_to_type = dict([(c, get_expr_type(c, self.source_table))
                                 for c in features])
         else:
             col_to_type = {}
             features = []
             ignored_cols = []
             for col in self._categorical_cols:
                 col_type = get_expr_type(col, self.source_table).lower()
                 if col_type in self._cat_types:
                     features.append(col)
                     col_to_type[col] = col_type
                 else:
                     ignored_cols.append(col)
             if ignored_cols:
                 plpy.warning("Encoding categorical: Ignoring non-categorical columns ({0})".
                              format(','.join(ignored_cols)))
         return features, col_to_type
     # -------------------------------------------------------------------------

     def _get_top_values(self):
         """ Get the top values for each column.

         Note: this function computes the frequencies for values of each column
         even if that column is not part of the 'top' argument. An improvement
         here is to compute top values only for requested columns and use
         distinct values for others.
         """
         def _cum_sum(values, start=0):
             for v in values:
                 start += v
                 yield start

         top_val_sql_list = []
         for col in self._output_cols:
             if not self.encode_null:
                 filter_str = 'WHERE {col} IS NOT NULL'.format(col=col)
             else:
                 filter_str = ''
             # get value distribution for each column independently
             top_val_sql_list.append("""
                 SELECT
                     '{col}' as col_name,
                     array_agg(f order by c desc) as value,
                     array_agg(c order by c desc) as freq
                 FROM (
                     SELECT {col}::text as f, count(*)::integer as c
                     FROM {tbl}
                     {filter_str}
                     GROUP BY {col}
                ) q
             """.format(col=col, tbl=self.source_table, filter_str=filter_str))
         top_values = plpy.execute('\n UNION ALL \n'.join(top_val_sql_list))

         # top_values is now a list of dictionary, each element
         # giving the frequency of the values in a column
         top_distinct_values = {}
         for each_col_data in top_values:
             col, ordered_values, ordered_freq = [each_col_data[i]
                                                  for i in ('col_name', 'value', 'freq')]
             # drop reference from distinct values
             if (col in self._value_to_drop and
                     self._value_to_drop[col] in ordered_values):
                 drop_index = ordered_values.index(self._value_to_drop[col])
                 ordered_values.pop(drop_index)
                 ordered_freq.pop(drop_index)

             if not ordered_values:
                 plpy.error("Encoding categorical error: "
                            "No top values found for {0} or "
                            "all values dropped as per function arguments"
                            "(value_to_drop, encode_null)".format(col))

             if col in self._top:
                 # for each column find at which point does the top 'k' values occur
                 top_arg = self._top[col]
                 if top_arg >= 1:
                     # top >= 1 is considered a integer count of values to pick ...
                     k = int(top_arg)
                 else:
                     # ..., top < 1 is considered a percent of total count
                     threshold = int(math.ceil(top_arg * sum(ordered_freq)))
                     cum_freq = list(_cum_sum(ordered_freq))
                     k = bisect(cum_freq, threshold, 0, len(ordered_values)-2) + 1

                 top_distinct_values[col] = ordered_values[:k]
                 if k < len(ordered_values):
                     # putting this into an if check avoids empty list
                     # at the end when k >= actual number of values
                     top_distinct_values[col] += [list(ordered_values[k:])]
             else:
                 # need all values for this column since no top provided
                 top_distinct_values[col] = ordered_values
         return top_distinct_values
     # -------------------------------------------------------------------------

     def _get_distinct_values(self):
         """ Find distinct values of each categorical column
         """
         array_agg_str = ',\n'.join('array_agg(DISTINCT {c}) AS {c_quoted}'.
                                    format(c=c, c_quoted=quote_ident(c))
                                    for c in self._output_cols)

         if self.encode_null:
             # Some platforms don't include NULL values as part of the
             # array_agg(DISTINCT ...). Below checks explicitly for NULL values
             null_str = ', ' + ',\n'.join(
                 'bool_or(CASE WHEN {c} IS NULL THEN True ELSE False END)'
                 ' AS "{c_}_isnull"'.format(c=c, c_=strip_end_quotes(c.strip()))
                 for c in self._output_cols)
         else:
             null_str = ''
         col_values_data = plpy.execute("SELECT {0} {1} FROM {2}".
                                        format(array_agg_str,
                                               null_str,
                                               self.source_table))[0]

         # Collect the distinct values (possibly including null) for every column
         distinct_values = {}
         for col in self._output_cols:
             dv = [str(i) if i is not None else i for i in col_values_data[col]]
             # drop reference from distinct values
             if (col in self._value_to_drop and self._value_to_drop[col] in dv):
                 dv.remove(self._value_to_drop[col])

             # check for NULL values
             if not self.encode_null:
                 if None in dv:
                     # ignore NULL if encode_null = False
                     dv.remove(None)
             else:
                 null_col_name = '"{c}_isnull"'.format(c=strip_end_quotes(col.strip()))
                 col_contains_null = self._get_quoted_unquoted(col_values_data, null_col_name)
                 if col_contains_null and None not in dv:
                     dv.append(None)

             if not dv:
                 plpy.error("Encoding categorical error: "
                            "No distinct values found for {0} or "
                            "all distinct values dropped as per function arguments"
                            "(value_to_drop, encode_null)".format(col))

             distinct_values[col] = dv
         return distinct_values
 # ------------------------------------------------------------------------------


 def encode_categorical_variables(
         schema_madlib, source_table, output_table, categorical_cols,
         categorical_cols_to_exclude=None,
         row_id=None,
         top=None,
         value_to_drop=None,
         encode_null=False,
         output_type='column',
         output_dictionary=False,
         distributed_by=None,
         **kwargs):
     """
     Main function to encode categorical variables
     Args:
         @param source_table:str, Name of table containing categorical variable
         @param output_table:str, Name of table to output dummy variables
         @param categorical_cols:str, Comma-separated list of column names to dummy code (can be '*')
         @param categorical_cols_to_exclude:str, Comma-separated list of column names to exclude (if categorical_cols = '*')
         @param row_id: str, Columns from source table to index output table
         @param top: str, Parameter to include only top values of a categorical variable
         @param value_to_drop: str, Parameter to set reference column in dummy coding
         @param encode_null: bool, If True, NULL is treated as a categorical value
         @param output_type: str, Parameter to determine if output should be an array, svec or individual columns
                                  Can take values ('column', 'array', 'svec')
         @param output_dictionary: bool, If True columns names are simplified and
                     a separate mapping table is created to understand the names
         @param distributed_by: str, Comma-separated list of column names to use for distribution of output
     """
     with MinWarning('warning'):
         encoder = CategoricalEncoder(schema_madlib, source_table, output_table,
                                      categorical_cols, categorical_cols_to_exclude,
                                      row_id, top, value_to_drop, encode_null,
                                      output_type, output_dictionary,
                                      distributed_by)
         encoder.build_output_table()
     return None
 # ---------------------------------------------------------------


 def encode_categorical_help(schema_madlib, message, **kwargs):
     """
     Help function for encode_categorical_variables

     Args:
         @param schema_madlib
         @param message: string, Help message string
         @param kwargs

     Returns:
         String. Help/usage information
     """
     if not message:
         help_string = """
 -----------------------------------------------------------------------
                             SUMMARY
 -----------------------------------------------------------------------
 Categorical variables [1] require special attention in regression analysis
 because, unlike dichotomous or continuous variables, they cannot be entered into
 the regression equation just as they are.  For example, if you have a variable
 called race that is coded with 1=Hispanic, 2=Asian, 3=Black, 4=White, then
 entering race in your regression will look at the linear effect of the race
 variable, which is probably not what you intended. Instead, categorical
 variables like this need to be coded into a series of indicator variables which
 can then be entered into the regression model.  There are a variety of coding
 systems that can be used for coding categorical variables, including one-hot,
 dummy, effects, orthogonal, and Helmert.

 We currently support one-hot and dummy coding techniques.

 Dummy coding is used when a researcher wants to compare other groups of the
 predictor variable with one specific group of the predictor variable.
 Often, the specific group to compare with is called the reference group.

 One-hot encoding is similar to dummy coding except it builds indicator (0/1)
 columns (cast as numeric) for each value of each category.
 Only one of these columns could take on the value 1 for each row (data point).
 There is no reference category for this function.

 For more details on function usage:
     SELECT {madlib}.encode_categorical_variables('usage')
             """
     elif message in ['usage', 'help', '?']:
         help_string = """
 -----------------------------------------------------------------------
                             USAGE
 -----------------------------------------------------------------------
 SELECT {madlib}.encode_categorical_variables (
         source_table,                   -- Name of source table
         output_table,                   -- Name of table to output encoded data
         categorical_cols,               -- Comma-separated list of columns to encode
                                         --  (can be *)
         categorical_cols_to_exclude,    -- (Optional) Columns to exclude if using '*' above
         row_id,                         -- (Optional) Columns corresponding to
                                         --  primary keys of source table
         top,                            -- (Optional) Parameter to encode only top values
         value_to_drop,                  -- (Optional) Reference value to drop for each column
         encode_null,                    -- (Optional) Whether NULL should be treated as one of the
                                         --  values of the categorical variable.
         output_type,                    -- (Optional) Get encoded variables in individual columns
                                         --    or as an array (Can be 'column', 'array', or 'svec')
         output_dictionary,              -- (Optional) Simplify output column naming and provide
                                         --  a mapping between simple names and meaning
         distributed_by                  -- (Optional) Columns to use for the distribution policy of
                                         --   the output table (does not apply for Postgresql)
         )

 Refer to online documentation for details on above parameters.
 -----------------------------------------------------------------------
                            OUTPUT
 -----------------------------------------------------------------------
     If there are index columns in the 'source_table' specified by the parameter
     'row_id' (see below), then the output table will contain only the index
     columns 'row_id' and the encoded columns.

     If the parameter 'row_id' is not specified, then all columns from the
     'source_table', with the exception of the original categorical columns,
     will be included in the 'output_table'.

 """
     else:
         help_string = """No such option.

 For more details on function usage:
     SELECT {madlib}.encode_categorical_variables('usage')
 """
     return help_string.format(madlib=schema_madlib)
 # ---------------------------------------------------------------------