blob: 92cd280d08b0b3c4d60f323c14ab3da511820e20 [file] [log] [blame]
# coding=utf-8
"""
@file logistic.py_in
@brief Logistic Regression: Driver functions
@namespace logistic
@brief Logistic Regression: Driver functions
"""
import plpy
from utilities.group_control import GroupIterationController
from utilities.utilities import unique_string
from utilities.validate_args import table_exists
from utilities.validate_args import columns_exist_in_table
from utilities.validate_args import table_is_empty
from utilities.validate_args import explicit_bool_to_text
from utilities.utilities import _string_to_array
from utilities.utilities import __mad_version
from utilities.utilities import add_postfix
from utilities.utilities import _string_to_array_with_quotes
from collections import defaultdict
# NOTE(review): presumably provides platform/version-specific helpers from
# utilities.utilities (__mad_version); not referenced in this chunk — confirm
# usage elsewhere in the module before removing.
version_wrapper = __mad_version()
# ========================================================================
def __compute_logregr(schema_madlib, rel_args, rel_state, rel_source,
                      dep_col, ind_col, optimizer, grouping_col,
                      grouping_str, col_grp_iteration, col_grp_state, **kwargs):
    """
    Compute logistic regression coefficients.

    Serves as an interface to the different optimization algorithms.
    By default, iteratively reweighted least squares is used, but for data
    with many columns the conjugate-gradient method might perform better.

    @param schema_madlib Name of the MADlib schema, properly escaped/quoted
    @param rel_args Relation storing optimizer parameters (max_iter, tolerance)
    @param rel_state Relation storing the per-iteration states
    @param rel_source Name of relation containing the training data
    @param dep_col Name of dependent column in training data (of type BOOLEAN)
    @param ind_col Name of independent column in training data (of type
                   DOUBLE PRECISION[])
    @param optimizer Name of the optimizer. 'newton' or 'irls': iteratively
                     reweighted least squares, 'cg': conjugate gradient,
                     'igd': incremental gradient descent
    @param grouping_col String of comma-delimited group-by columns
    @param grouping_str String of comma-delimited group-by columns (casted)
    @param col_grp_iteration Column name holding the iteration number
    @param col_grp_state Column name holding the iteration state
    @param kwargs Additional arguments are accepted (and ignored) so the
                  caller can unpack a dictionary whose element set is a
                  superset of the arguments required by this function.
    @return Number of iterations that have been run
    """
    controller = GroupIterationController(
        rel_args=rel_args,
        rel_state=rel_state,
        stateType="double precision[]",
        schema_madlib=schema_madlib,  # Identifiers start here
        rel_source=rel_source,
        ind_col=ind_col,
        dep_col=dep_col,
        optimizer=optimizer,
        grouping_col=grouping_col,
        grouping_str=grouping_str,
        col_grp_iteration=col_grp_iteration,
        col_grp_state=col_grp_state)
    with controller as ctrl:
        ctrl.iteration = 0
        keep_going = True
        while keep_going:
            # One optimizer step per iteration; the SQL template is expanded
            # by the iteration controller.
            ctrl.update(
                """
                {schema_madlib}.__logregr_{optimizer}_step(
                    ({dep_col})::boolean,
                    ({ind_col})::double precision[],
                    rel_state.{col_grp_state})
                """)
            # Stop when: the iteration limit is reached, the step function
            # signalled completion (status code 3 in the last state element),
            # or the inter-iteration distance fell below the tolerance.
            keep_going = not ctrl.test(
                """
                {iteration} >= _args.max_iter
                or
                _state_current[array_upper(_state_current, 1)] = 3
                or
                {schema_madlib}.__logregr_{optimizer}_step_distance(
                    _state_previous, _state_current) < _args.tolerance
                """)
    return controller.iteration
# ========================================================================
def logregr_train(
        schema_madlib, source_table, out_table, dependent_varname,
        independent_varname, grouping_cols=None, max_iter=None,
        optimizer=None, tolerance=None, verbose=None, **kwargs):
    """
    Train a logistic regression model.

    @param schema_madlib Name of the MADlib schema, properly escaped/quoted
    @param source_table Name of relation containing the training data
    @param out_table Name of relation where the model will be written
    @param dependent_varname Name of dependent column in training data
                             (of type BOOLEAN)
    @param independent_varname Name of independent column in training data
                               (of type DOUBLE PRECISION[])
    @param grouping_cols String of comma-delimited group-by columns
    @param max_iter The maximum number of iterations that are allowed
    @param optimizer Name of the optimizer. 'newton' or 'irls': iteratively
                     reweighted least squares, 'cg': conjugate gradient,
                     'igd': incremental gradient descent
    @param tolerance The precision that the results should have
    @param verbose Whether to surface backend warnings during computation
    @param kwargs Additional arguments are accepted (and ignored) so the
                  caller can unpack a dictionary whose element set is a
                  superset of the arguments required by this function.
    @return A composite value which is __logregr_result defined in
            logistic.sql_in
    """
    # Validation also canonicalizes the optimizer name ('newton' -> 'irls').
    canonical_optimizer = __logregr_validate_args(
        schema_madlib, source_table, out_table, dependent_varname,
        independent_varname, grouping_cols, max_iter, optimizer, tolerance)
    return __logregr_train_compute(
        schema_madlib, source_table, out_table, dependent_varname,
        independent_varname, grouping_cols, max_iter, canonical_optimizer,
        tolerance, verbose, **kwargs)
# ========================================================================
def __logregr_validate_args(schema_madlib, tbl_source, tbl_output, dep_col,
                            ind_col, grouping_col, max_iter, optimizer,
                            tolerance):
    """
    Validate the arguments of logregr_train.

    @param schema_madlib Name of the MADlib schema, properly escaped/quoted
    @param tbl_source Name of relation containing the training data
    @param tbl_output Name of relation where the model will be written;
                      must not already exist
    @param dep_col Dependent column name/expression
    @param ind_col Independent column name/expression
    @param grouping_col String of comma-delimited group-by columns, or None
    @param max_iter Maximum number of iterations; must be positive
    @param optimizer Requested optimizer name
    @param tolerance Convergence threshold; must be non-negative
    @return The canonical optimizer name ('newton' is mapped to 'irls')
    """
    if tbl_source is None or tbl_source.strip().lower() in ('null', ''):
        plpy.error("Logregr error: Invalid data table name!")
    if not table_exists(tbl_source):
        plpy.error("Logregr error: Data table does not exist!")
    if table_is_empty(tbl_source):
        plpy.error("Logregr error: Data table is empty!")
    if tbl_output is None or tbl_output.strip().lower() in ('null', ''):
        plpy.error("Logregr error: Invalid output table name!")
    if table_exists(tbl_output, only_first_schema=True):
        plpy.error("Output table name already exists. Drop the table before "
                   "calling the function.")
    if not dep_col or dep_col.strip().lower() in ('null', ''):
        plpy.error("Logregr error: Invalid dependent column name!")
    # NOTE: the dependent "column" may be an expression, so its existence in
    # the source table is deliberately not verified here.
    # Fix: strip surrounding whitespace before the sentinel comparison, for
    # consistency with the dep_col / table-name checks above.
    if not ind_col or ind_col.strip().lower() in ('null', ''):
        plpy.error("Logregr error: Invalid independent column name!")
    if grouping_col is not None:
        if grouping_col == '':
            plpy.error("Logregr error: Invalid grouping columns name!")
        if not columns_exist_in_table(
                tbl_source, _string_to_array(grouping_col), schema_madlib):
            plpy.error("Logregr error: Grouping column does not exist!")
        # Grouping columns must not collide with the fixed result column
        # names of the output table.
        reserved = frozenset(('coef', 'log_likelihood', 'std_err', 'z_stats',
                              'p_values', 'odds_ratios', 'condition_no',
                              'num_processed', 'num_missing_rows_skipped',
                              'variance_covariance'))
        intersect = frozenset(_string_to_array(grouping_col)) & reserved
        if len(intersect) > 0:
            # Fix: grammar in the user-facing message ("not allow" ->
            # "not allowed").
            plpy.error("Logregr error: Conflicted grouping column name.\n"
                       "Predefined name(s) {0} are not allowed!".format(
                           ', '.join(intersect)))
    if max_iter <= 0:
        plpy.error("Logregr error: Maximum number of iterations must be positive!")
    if tolerance < 0:
        plpy.error("Logregr error: The tolerance cannot be negative!")
    if optimizer == "newton":
        # 'newton' is an alias for iteratively reweighted least squares.
        optimizer = "irls"
    elif optimizer not in ("irls", "cg", "igd"):
        plpy.error(""" Logregr error: Unknown optimizer requested.
            Must be 'newton'/'irls', 'cg', or 'igd'.
            """)
    return optimizer
# ========================================================================
def __logregr_train_compute(schema_madlib, tbl_source, tbl_output, dep_col,
                            ind_col, grouping_col, max_iter, optimizer,
                            tolerance, verbose, **kwargs):
    """
    Create an output table (drop if exists) that contains the logistic
    regression model.

    @param schema_madlib Name of the MADlib schema, properly escaped/quoted
    @param tbl_source Name of relation containing the training data
    @param tbl_output Name of relation where the model is written; a second
                      relation <tbl_output>_summary is also created
    @param dep_col Dependent column name/expression (BOOLEAN)
    @param ind_col Independent column name/expression (DOUBLE PRECISION[])
    @param grouping_col String of comma-delimited group-by columns, or None
    @param max_iter Maximum number of iterations allowed
    @param optimizer Canonical optimizer name ('irls', 'cg' or 'igd')
    @param tolerance Convergence threshold
    @param verbose Whether to let backend warnings through
    @return None; all results are written to tbl_output and the summary table
    """
    # Remember the current message level so it can be restored at the end.
    old_msg_level = plpy.execute("select setting from pg_settings where \
        name='client_min_messages'")[0]['setting']
    if verbose:
        plpy.execute("set client_min_messages to warning")
    else:
        plpy.execute("set client_min_messages to error")
    args = {'schema_madlib': schema_madlib,
            'tbl_source': tbl_source,
            'tbl_output': tbl_output,
            'dep_col': dep_col,
            'ind_col': ind_col,
            'max_iter': max_iter,
            'optimizer': optimizer,
            'tolerance': tolerance,
            # Session-unique names avoid collisions with user objects.
            'tbl_logregr_args': unique_string(),
            'tbl_logregr_state': unique_string(),
            'col_grp_iteration': unique_string(),
            'col_grp_state': unique_string(),
            # Map each optimizer to the SQL function that unpacks its final
            # state into the composite result type.
            'irls': "__logregr_irls_result",
            'newton': "__logregr_irls_result",
            'cg': "__logregr_cg_result",
            'igd': "__logregr_igd_result"}
    plpy.execute("SELECT {schema_madlib}.create_schema_pg_temp()".
                 format(**args))
    # Stash the optimizer parameters in a temp table read by the iteration
    # controller (_args.max_iter / _args.tolerance).
    plpy.execute("""
        CREATE TABLE pg_temp.{tbl_logregr_args} as
        SELECT
            {max_iter} as max_iter,
            {tolerance} as tolerance
        """.format(**args))
    # return an array of dict
    # each dict has two elements: iteration number, and grouping value array
    if grouping_col:
        # Boolean grouping columns are cast to text explicitly; all grouping
        # values are then compared as text.
        grouping_list = explicit_bool_to_text(
            tbl_source, _string_to_array_with_quotes(grouping_col), schema_madlib)
        for i in range(len(grouping_list)):
            grouping_list[i] += "::text"
        grouping_str = ','.join(grouping_list)
    else:
        grouping_str = "Null"
    # Run the iterative optimization; per-iteration states are accumulated
    # in tbl_logregr_state.
    iteration_run = __compute_logregr(schema_madlib, args["tbl_logregr_args"],
                                      args["tbl_logregr_state"], tbl_source,
                                      dep_col, ind_col, optimizer,
                                      grouping_col=grouping_col,
                                      grouping_str=grouping_str,
                                      col_grp_iteration=args[
                                          "col_grp_iteration"],
                                      col_grp_state=args["col_grp_state"])
    # SQL fragments that differ between grouped and non-grouped runs: the
    # grouped form joins on the grouping columns, the non-grouped form uses
    # a cross join ("," + no USING clause) and a trivial GROUP BY predicate.
    grouping_str1 = "" if grouping_col is None else grouping_col + ","
    grouping_str2 = "1 = 1" if grouping_col is None else grouping_col
    using_str = "" if grouping_str1 == "" else "using (" + grouping_col + ")"
    join_str = "," if grouping_str1 == "" else "join "
    # Materialize the model: for each group, pick the state of the last
    # iteration (subquery s), unpack it with the optimizer's result function
    # (subquery t), and join with per-group row counts (q2). A result with
    # status != 1 yields NULL model columns; a NULL result means no rows
    # were processed at all.
    plpy.execute("""
        CREATE TABLE {tbl_output} as
        SELECT
            {grouping_str1}
            (CASE WHEN (result).status = 1 THEN (result).coef
                ELSE NULL::double precision[] END) as coef,
            (CASE WHEN (result).status = 1 THEN (result).log_likelihood
                ELSE NULL::double precision END) as log_likelihood,
            (CASE WHEN (result).status = 1 THEN (result).std_err
                ELSE NULL::double precision[] END) as std_err,
            (CASE WHEN (result).status = 1 THEN (result).z_stats
                ELSE NULL::double precision[] END) as z_stats,
            (CASE WHEN (result).status = 1 THEN (result).p_values
                ELSE NULL::double precision[] END) as p_values,
            (CASE WHEN (result).status = 1 THEN (result).odds_ratios
                ELSE NULL::double precision[] END) as odds_ratios,
            (CASE WHEN (result).status = 1 THEN (result).condition_no
                ELSE NULL::double precision END) as condition_no,
            (CASE WHEN (result).status = 1 THEN (result).num_processed
                when result is NULL THEN 0
                ELSE NULL::bigint end) AS num_rows_processed,
            (CASE WHEN (result).status = 1 THEN num_rows - (result).num_processed
                when result is null THEN num_rows
                ELSE NULL::bigint end) AS num_missing_rows_skipped,
            {col_grp_iteration} as num_iterations,
            (CASE WHEN (result).status = 1 THEN (result).vcov
                ELSE NULL::double precision[] END) as variance_covariance
        FROM
        (
            SELECT {col_grp_iteration}, {grouping_str1} result, num_rows
            FROM (
                (SELECT
                    {grouping_str1}
                    {schema_madlib}.{fnName}({col_grp_state}) as result,
                    {col_grp_iteration}
                FROM
                    {tbl_logregr_state}
                ) t
                JOIN
                (SELECT
                    {grouping_str1}
                    max({col_grp_iteration}) as {col_grp_iteration}
                FROM {tbl_logregr_state}
                GROUP BY {grouping_str2}
                ) s
                USING ({grouping_str1} {col_grp_iteration})
            ) q1
            {join_str}
            (
                SELECT
                    {grouping_str1}
                    count(*) num_rows
                FROM {tbl_source}
                GROUP BY {grouping_str2}
            ) q2
            {using_str}
        ) q3
        """.format(grouping_str1=grouping_str1,
                   grouping_str2=grouping_str2,
                   fnName=args[args["optimizer"]],
                   iteration_run=iteration_run,
                   using_str=using_str,
                   join_str=join_str,
                   **args))
    # A NULL coef marks a group whose fit failed.
    failed_groups = plpy.execute("""
        SELECT count(*) as count
        FROM {tbl_output}
        WHERE coef IS NULL
        """.format(tbl_output=tbl_output))[0]["count"]
    all_groups = plpy.execute("""
        SELECT count(*) AS count
        FROM {tbl_output}
        """.format(**args))[0]["count"]
    # JIRA: MADLIB-1172: When no model can be generated due to ill-conditioned
    # input data, the output table doesn't get populated. In that
    # case, report back an error instead of a successful empty table.
    if failed_groups == all_groups or all_groups == 0:
        plpy.execute("DROP TABLE IF EXISTS " + str(tbl_output))
        plpy.error("Logregr error: No model created possibly "
                   "due to ill-conditioned data.")
    # Aggregate the per-group row counts for the summary table.
    num_rows = plpy.execute("""
        SELECT
            sum(num_rows_processed) AS num_rows_processed,
            sum(num_missing_rows_skipped) AS num_missing_rows_skipped
        FROM {tbl_output}
        """.format(tbl_output=tbl_output))[0]
    if num_rows['num_rows_processed'] is None:
        # No counts available: substitute SQL NULL literals so the summary
        # query below stays valid.
        num_rows['num_rows_processed'] = "NULL"
        num_rows['num_missing_rows_skipped'] = "NULL"
    args.update(num_rows)
    tbl_output_summary = add_postfix(tbl_output, "_summary")
    plpy.execute(
        """
        create table {tbl_output_summary} as
        select
            'logregr'::varchar as method,
            '{tbl_source}'::varchar as source_table,
            '{tbl_output}'::varchar as out_table,
            '{dep_col}'::varchar as dependent_varname,
            '{ind_col}'::varchar as independent_varname,
            'optimizer={optimizer}, max_iter={max_iter}, tolerance={tolerance}'::varchar as optimizer_params,
            {all_groups}::integer as num_all_groups,
            {failed_groups}::integer as num_failed_groups,
            {num_rows_processed}::integer as num_rows_processed,
            {num_missing_rows_skipped}::integer as num_missing_rows_skipped,
            {grouping_col}::text as grouping_col
        """.format(all_groups=all_groups,
                   failed_groups=failed_groups, tbl_output_summary=tbl_output_summary,
                   grouping_col="'" + grouping_col + "'" if grouping_col else "NULL",
                   **args))
    # Clean up the temporary argument/state tables.
    plpy.execute("""
        DROP TABLE IF EXISTS pg_temp.{tbl_logregr_args};
        DROP TABLE IF EXISTS pg_temp.{tbl_logregr_state}
        """.format(**args))
    # Restore the caller's message level.
    plpy.execute("set client_min_messages to " + old_msg_level)
    return None
# --------------------------------------------------------------------
def logregr_help_msg(schema_madlib, message, **kwargs):
    """ Help message for logistic regression.

    @param message A string, the help message indicator
    Returns:
        A string containing the requested help message
    """
    summary_msg = """
----------------------------------------------------------------
                        SUMMARY
----------------------------------------------------------------
Binomial logistic regression models the relationship between a
dichotomous dependent variable and one or more predictor variables.
The dependent variable may be a Boolean value or a categorical variable
that can be represented with a Boolean expression.
For more details on function usage:
    SELECT {schema_madlib}.logregr_train('usage')
"""
    usage_msg = """
------------------------------------------------------------------
                        USAGE
------------------------------------------------------------------
SELECT {schema_madlib}.logregr_train(
    source_table,        -- name of input table
    out_table,           -- name of output table
    dependent_varname,   -- name of dependent variable
    independent_varname, -- names of independent variables
    grouping_cols,       -- optional, default NULL, names of columns to group-by
    max_iter,            -- optional, default 20, maximum iteration number
    optimizer,           -- optional, default 'irls', name of optimization method
    tolerance,           -- optional, default 0.0001, the stopping threshold
    verbose              -- optional, default FALSE, whether to print useful info
);
------------------------------------------------------------------
                        OUTPUT
------------------------------------------------------------------
The output table ('out_table' above) has the following columns:
    <...>,                                -- Grouping column values used during training
    'coef',          double precision[],  -- vector of fitting coefficients
    'log_likelihood', double precision,   -- log likelihood
    'std_err',       double precision[],  -- vector of standard errors of the fitting coefficients
    'z_stats',       double precision[],  -- vector of the z-statistics of the coefficients
    'p_values',      double precision[],  -- vector of the p values
    'odds_ratios',   double precision[],  -- vector of odds ratios, exp(coefficients)
    'condition_no',  double precision,    -- the condition number
    'num_rows_processed', integer,        -- how many rows are actually used in the computation
    'num_missing_rows_skipped', integer,  -- number of rows that contain NULL and were skipped per group
    'num_iterations' double precision     -- how many iterations are used in the computation per group
A summary table named <out_table>_summary is also created at the same time, which has:
    'method'            varchar, -- modeling method name ('logregr')
    'source_table'      varchar, -- the data source table name
    'out_table'         varchar, -- the output table name
    'dependent_varname' varchar, -- the dependent variable
    'independent_varname' varchar, -- the independent variable
    'optimizer_params'  varchar, -- 'optimizer=..., max_iter=..., tolerance=...'
    'num_all_groups'    integer, -- how many groups
    'num_failed_groups' integer, -- how many groups' fitting processes failed
    'num_rows_processed' integer, -- total number of rows used in the computation
    'num_missing_rows_skipped' integer, -- total number of rows skipped
    'grouping_col'      varchar  -- grouping columns used in the regression
"""
    unknown_msg = "No such option. Use {schema_madlib}.logregr_train('help')"
    # Select the message template, then substitute the schema name once.
    if not message:
        selected = summary_msg
    elif message in ('usage', 'help', '?'):
        selected = usage_msg
    else:
        selected = unknown_msg
    return selected.format(schema_madlib=schema_madlib)
# ========================================================================
def logregr_predict_help(schema_madlib, message, **kwargs):
    """ Help message for logistic regression prediction.

    @param message A string, the help message indicator
    Returns:
        A string containing the requested help message
    """
    # Fix: accept an explicit 'summary' request, consistent with
    # logregr_predict_prob_help (previously 'summary' fell through to the
    # 'unknown' branch).
    if not message or message == 'summary':
        message = "summary"
    elif message.lower() in ['usage', 'help', '?']:
        message = 'usage'
    elif message.lower() in ['example', 'examples']:
        message = 'example'
    else:
        message = 'unknown'
    help_string = defaultdict(str)
    help_string['summary'] = """
----------------------------------------------------------------
                        SUMMARY
----------------------------------------------------------------
Prediction for logistic regression can be used to obtain a prediction of the
boolean value of the dependent variable given a value of independent variable.
For more details on function usage:
    SELECT {schema_madlib}.logregr_predict('usage')
For a small example on using the function:
    SELECT {schema_madlib}.logregr_predict('example')
"""
    help_string['usage'] = """
------------------------------------------------------------------
                        USAGE
------------------------------------------------------------------
logregr_predict(
    coef,        -- DOUBLE PRECISION[], Coefficient of logistic regression model
    col_ind_var  -- DOUBLE PRECISION[], Values for the independent variables
)
The lengths of 'coef' array and 'col_ind_var' array above should be equal. For
a small example on using the function:
    SELECT {schema_madlib}.logregr_predict('example')
------------------------------------------------------------------
                        OUTPUT
------------------------------------------------------------------
The output of the function is a BOOLEAN value representing the predicted
dependent variable value for the inputed independent variables.
"""
    help_string['example'] = """
The tables below are obtained from the example in 'logregr_train'.
Details can be found by running "SELECT logregr_train('examples');"
-- We compare the prediction with the actual value 'second_attack'
SELECT id,
       madlib.logregr_predict(coef, ARRAY[1, treatment, trait_anxiety]) as pred_value,
       second_attack as orig_value
FROM patients p, patients_logregr m
ORDER BY id;
"""
    help_string['unknown'] = "No such option. Use {schema_madlib}.logregr_predict()"
    return help_string[message].format(schema_madlib=schema_madlib)
# -------------------------------------------------------------------------
def logregr_predict_prob_help(schema_madlib, message, **kwargs):
    """ Help message for logistic regression probability prediction.

    @param message A string, the help message indicator
    Returns:
        A string containing the requested help message
    """
    summary_text = """
----------------------------------------------------------------
                        SUMMARY
----------------------------------------------------------------
This function can be used to obtain a prediction of the probability of
the dependent variable in a logistic regression model given the value of
independent variables.
For more details on function usage:
    SELECT {schema_madlib}.logregr_predict_prob('usage')
For a small example on using the function:
    SELECT {schema_madlib}.logregr_predict_prob('example')
"""
    usage_text = """
------------------------------------------------------------------
                        USAGE
------------------------------------------------------------------
logregr_predict_prob(
    coef,        -- DOUBLE PRECISION[], Coefficient of logistic regression model
    col_ind_var  -- DOUBLE PRECISION[], Values for the independent variables
)
The lengths of 'coef' array and 'col_ind_var' array above should be equal. For
a small example on using the function:
    SELECT {schema_madlib}.logregr_predict_prob('example')
------------------------------------------------------------------
                        OUTPUT
------------------------------------------------------------------
The output of the function is a DOUBLE PRECISION value representing the
probability of predicted dependent variable value being TRUE for the inputed
independent variable values.
"""
    example_text = """
-- The tables below are obtained from the example in 'logregr_train'.
-- Details can be found by running "SELECT logregr_train('examples');"
SELECT id,
       madlib.logregr_predict_prob(coef, ARRAY[1, treatment, trait_anxiety]) as pred_prob
FROM patients p, patients_logregr m
ORDER BY id;
"""
    unknown_text = "No such option. Use {schema_madlib}.logregr_predict_prob()"
    # Select the message template, then substitute the schema name once.
    if not message or message == 'summary':
        chosen = summary_text
    elif message.lower() in ('usage', 'help', '?'):
        chosen = usage_text
    elif message.lower() in ('example', 'examples'):
        chosen = example_text
    else:
        chosen = unknown_text
    return chosen.format(schema_madlib=schema_madlib)