src/ports/postgres/modules/regress/multilogistic.py_in - madlib - Git at Google

 # coding=utf-8

 """
 @file multilogistic.py_in

 @brief Multinomial Logistic Regression: Driver functions

 @namespace multilogistic

 Multinomial Logistic Regression: Driver functions
 """

 import plpy
 from utilities.validate_args import table_exists
 from utilities.validate_args import table_is_empty
 from utilities.utilities import preprocess_optimizer_params
 from utilities.utilities import _assert


 def __runIterativeAlg(stateType, initialState, source, updateExpr,
                       terminateExpr, max_num_iterations,
                       cyclesPerIteration=1):
     """
     Generic in-database driver for iterative algorithms

     Every iteration's state is stored as one row of the temporary table
     <tt>_madlib_iterative_alg</tt> (columns <tt>_madlib_iteration</tt> and
     <tt>_madlib_state</tt>). Row 0 holds <tt><em>initialState</em></tt>. Each
     pass of the loop inserts the row for the next iteration by evaluating
     <tt>updateExpr</tt> over the source relation joined with the previous
     state row, and then decides whether to stop:

     - stop immediately if the freshly inserted state is NULL;
     - otherwise, once more than <tt>cyclesPerIteration</tt> rows have been
       produced, stop if
       <tt>iteration >= cyclesPerIteration * max_num_iterations</tt> or if
       <tt>terminateExpr</tt> evaluates to true.

     The temporary table is intentionally left in place so that the caller can
     read the final state from it.

     @param stateType SQL type of the state kept between iterations
     @param initialState SQL expression for the initial state (iteration 0)
     @param source The source relation
     @param updateExpr SQL expression producing the new state of type
         <tt>stateType</tt>. It may use the replacement fields
         <tt>"{state}"</tt>, <tt>"{iteration}"</tt>, and
         <tt>"{sourceAlias}"</tt> (the alias of the source relation).
     @param terminateExpr SQL expression returning a BOOLEAN that tells whether
         the algorithm should terminate. It may use the replacement fields
         <tt>"{oldState}"</tt>, <tt>"{newState}"</tt>, and
         <tt>"{iteration}"</tt>.
     @param max_num_iterations Upper bound on the number of iterations; the
         loop stops at that bound even if <tt>terminateExpr</tt> is not true
     @param cyclesPerIteration Number of aggregate function calls per iteration

     @return Number of the last iteration that was executed, i.e. the value of
         <tt>_madlib_iteration</tt> of the final state row
     """

     null_state_query = """
         SELECT _madlib_state IS NULL AS should_terminate
         FROM _madlib_iterative_alg
         WHERE _madlib_iteration = {iteration}
         """
     # Doubled braces survive this first format() call and are filled in per
     # iteration inside the loop below.
     convergence_query = """
         SELECT
             {terminateExpr} AS should_terminate
         FROM
         (
             SELECT _madlib_state
             FROM _madlib_iterative_alg
             WHERE _madlib_iteration = {{iteration}} - {{cyclesPerIteration}}
         ) AS older,
         (
             SELECT _madlib_state
             FROM _madlib_iterative_alg
             WHERE _madlib_iteration = {{iteration}}
         ) AS newer
         """.format(terminateExpr=terminateExpr)
     step_query = """
         INSERT INTO _madlib_iterative_alg
         SELECT
             {{iteration}},
             {updateExpr}
         FROM
             _madlib_iterative_alg AS st,
             {{source}} AS src
         WHERE
             st._madlib_iteration = {{iteration}} - 1
         """.format(updateExpr=updateExpr)

     # Remember the message level so it can be restored right after the
     # (noisy) DROP/CREATE of the state table.
     message_level_rows = plpy.execute("""SELECT setting
                                   FROM pg_settings
                                   WHERE name='client_min_messages'"""
                                       )
     saved_level = message_level_rows[0]['setting']

     plpy.execute("""
         SET client_min_messages = error;
         DROP TABLE IF EXISTS _madlib_iterative_alg;
         CREATE TEMPORARY TABLE _madlib_iterative_alg (
             _madlib_iteration INTEGER PRIMARY KEY,
             _madlib_state {stateType}
         );
         SET client_min_messages = {oldMsgLevel};
         """.format(oldMsgLevel=saved_level,
                    stateType=stateType))

     # Seed the table with the initial state as iteration 0.
     iteration = 0
     plpy.execute("""
         INSERT INTO _madlib_iterative_alg VALUES ({iteration}, {initialState})
         """.format(iteration=iteration, initialState=initialState))

     iteration_cap = cyclesPerIteration * max_num_iterations
     while True:
         iteration += 1
         plpy.execute(step_query.format(
             iteration=iteration,
             source=source,
             sourceAlias="src",
             state="(st._madlib_state)"))

         # A NULL state ends the loop unconditionally.
         null_rows = plpy.execute(
             null_state_query.format(iteration=iteration))
         if null_rows[0]['should_terminate']:
             break

         # No older state from a complete cycle exists yet to compare with.
         if not iteration > cyclesPerIteration:
             continue

         # Iteration budget exhausted: stop without running the convergence
         # query.
         if iteration >= iteration_cap:
             break

         convergence_rows = plpy.execute(convergence_query.format(
             iteration=iteration,
             cyclesPerIteration=cyclesPerIteration,
             newState="(newer._madlib_state)",
             oldState="(older._madlib_state)"))
         if convergence_rows[0]['should_terminate']:
             break

     # Note: We do not drop the temporary table
     return iteration


 def compute_mlogregr(schema_madlib, source_table, dependent_varname,
                      independent_varname, num_categories,
                      max_iter, optimizer,
                      precision, ref_category, **kwargs):
     """
     Run the multinomial logistic regression optimizer

     Validates the iteration bound and the optimizer name, builds the SQL
     update and termination expressions for the chosen optimizer and hands
     them to the iterative driver. 'newton' is accepted as an alias of 'irls',
     which is the only optimizer implemented here. The fitted state is left in
     the temporary table <tt>_madlib_iterative_alg</tt>.

     @param schema_madlib Name of the MADlib schema, properly escaped/quoted
     @param source_table Name of relation containing the training data
     @param dependent_varname Name of dependent column in training data (of type INTEGER)
     @param independent_varname Name of independent column in training data (of type
            DOUBLE PRECISION[])
     @param num_categories Number of categories in the multilogistic regression
     @param max_iter Maximum number of iterations
     @param optimizer Name of the optimizer. 'newton' or 'irls'
     @param precision Threshold compared against the step distance between two
            consecutive states; the iteration stops once the distance is
            smaller than <tt>precision</tt>. A negative value disables this
            convergence criterion.
     @param ref_category The user-specified reference category
     @param kwargs Additional keyword arguments are accepted and ignored so
            that callers can unpack a dictionary holding a superset of the
            arguments required here.

     @return Number of the last iteration executed by the iterative driver
     """

     if max_iter < 1:
         plpy.error("Mlogregr error: Number of iterations must be positive")

     # 'newton' is just another name for 'irls'; anything else is rejected.
     if optimizer == 'newton':
         optimizer = 'irls'
     elif optimizer != 'irls':
         plpy.error("Mlogregr error: Unknown optimizer requested."
                    " Must be 'newton' or 'irls'")

     # {{state}}, {{newState}} and {{oldState}} are left as replacement fields
     # for the iterative driver to fill in.
     step_expression = """
             {schema_madlib}.__mlogregr_{optimizer}_step(
                 ({depvar}),
                 ({num_categories}),
                 ({ref_category}),
                 ({indepvar})::FLOAT8[],
                 {{state}}
             )
             """.format(optimizer=optimizer,
                        ref_category=ref_category,
                        num_categories=num_categories,
                        indepvar=independent_varname,
                        depvar=dependent_varname,
                        schema_madlib=schema_madlib)
     stop_expression = """
             {schema_madlib}.__internal_mlogregr_{optimizer}_step_distance(
                 {{newState}}, {{oldState}}
             ) < {precision}
             """.format(precision=precision,
                        optimizer=optimizer,
                        schema_madlib=schema_madlib)

     return __runIterativeAlg(
         stateType="FLOAT8[]",
         initialState="NULL",
         source=source_table,
         updateExpr=step_expression,
         terminateExpr=stop_expression,
         max_num_iterations=max_iter)
 # -------------------------------------------------------------------------


 def mlogregr_train(schema_madlib, source_table, output_table, dependent_varname,
                    independent_varname, ref_category, optimizer_params,
                    *args, **kwargs):
     """
     Train a multinomial logistic regression model and store it in tables

     Args:
         @param schema_madlib: string, Name of the MADlib schema, properly escaped/quoted
         @param source_table: string, Name of relation containing the training data
         @param output_table: string, Name of output relation containing the trained model
         @param dependent_varname: string, Name of dependent column in training data (of type INTEGER)
         @param independent_varname: string, Name of independent column in training data (of type
                DOUBLE PRECISION[])
         @param ref_category: integer, The user-specified reference category
                                 (value in [0, num_categories-1])
         @param optimizer_params: string, Optimization algorithm parameters
                 Contains key=value pairs where key can be one of:
                     optimizer: Name of the optimizer. 'newton' or 'irls'
                     max_iter: Maximum number of iterations
                     precision: Terminate if two consecutive iterations have a difference
                         in the log-likelihood of less than <tt>precision</tt>. In other
                         words, we terminate if the objective function value has converged.
                         This convergence criterion can be disabled by specifying a negative
                         value.
         @param args, kwargs: We allow the caller to specify additional arguments
                (all of which will be ignored though). The purpose of this is to allow the
                caller to unpack a dictionary whose element set is a superset of
                the required arguments by this function.

     Returns:
         None

     Side effect:
         Creates two tables.
     The output table ('output_table' above) has the following columns
      'category'        -- Category the row's values belong to
      'ref_category'    -- Reference category used for training
      'coef'            -- Coefficients of regression
      'loglikelihood'   -- Log-likelihood value
      'std_err'         -- Standard errors
      'z_stats'         -- z-stats of the standard errors
      'p_values'        -- p-values of the standard errors
      'odd_ratios'      -- Odds ratios
      'condition_no'    -- Condition number reported by the optimizer result
      'num_iterations'  -- Number of iterations performed by the optimizer

     The output summary table is named <output_table>_summary and has the
     following columns
      'source_table'             VARCHAR  -- source table name
      'out_table'                VARCHAR  -- output table name
      'dependent_varname'        VARCHAR  -- dependent variable name
      'independent_varname'      VARCHAR  -- independent variable name
      'optimizer_params'         VARCHAR  -- optimizer parameter string
      'ref_category'             INTEGER  -- reference category
      'num_processed'            INTEGER  -- Number of rows processed during training
      'num_missing_rows_skipped' INTEGER  -- Number of rows skipped during training due
                                          --    to missing values

     Invalid input is reported through _assert / plpy.error.
     """
     # reduce the number of messages to user
     old_msg_level = plpy.execute("""
                                  SELECT setting FROM pg_settings
                                  WHERE name='client_min_messages'
                                  """)[0]['setting']
     plpy.execute("set client_min_messages to error")

     all_arguments = {'schema_madlib': schema_madlib,
                      'source_table': source_table,
                      'output_table': output_table,
                      'dependent_varname': dependent_varname,
                      'independent_varname': independent_varname,
                      'ref_category': ref_category,
                      'optimizer_params': optimizer_params
                      }

     # validate parameters
     _validate_params(**all_arguments)

     # Category statistics over the rows that can actually be used for
     # training (no NULL in the dependent or independent variables).
     non_null_results = plpy.execute(""" SELECT count(DISTINCT {dependent_varname}) AS cnt,
                                                  max({dependent_varname}) as max_cat,
                                                  min({dependent_varname}) as min_cat
                                           FROM {source_table}
                                           WHERE
                                             {dependent_varname} IS NOT NULL
                                             AND {independent_varname} IS NOT NULL
                                             AND NOT {schema_madlib}.array_contains_null({independent_varname})

                                     """.format(**all_arguments))[0]

     # With no usable row, max()/min() are NULL and int(None) would raise a
     # TypeError; report a proper error instead.
     num_categories = int(non_null_results['cnt'])
     _assert(num_categories > 0,
             "Mlogregr error: Source table {0} has no rows with non-NULL "
             "dependent and independent variables".format(source_table))
     max_category = int(non_null_results['max_cat'])
     min_category = int(non_null_results['min_cat'])

     _assert(max_category == num_categories-1 and min_category == 0,
             "Mlogregr error: Value of the dependent variable should be "
             "integers in the range [0, {0}]".format(num_categories-1))
     _assert(ref_category >= 0 and ref_category < num_categories,
             "Mlogregr error: Invalid reference category value {0}. "
             "It should be between 0 and {1}".format(ref_category,
                                                     num_categories-1))
     all_arguments['num_categories'] = num_categories

     all_arguments['total_rows'] = int(plpy.execute("""SELECT count(*) as cnt
                                                       FROM {0}""".
                                                    format(source_table))[0]["cnt"])

     # Take the feature count from a row where it is defined: array_upper()
     # is NULL for a NULL or empty array, and an arbitrary first row may be
     # exactly such a row.
     feature_rows = plpy.execute("""
         SELECT
             array_upper({independent_varname}, 1) AS fnum
         FROM {source_table}
         WHERE array_upper({independent_varname}, 1) IS NOT NULL
         LIMIT 1
         """.format(**all_arguments))
     _assert(len(feature_rows) > 0,
             "Mlogregr error: Could not determine the number of features "
             "from {0}".format(independent_varname))
     all_arguments['num_features'] = feature_rows[0]['fnum']

     optimizer_param_dict = _extract_params(schema_madlib, optimizer_params)
     all_arguments.update(optimizer_param_dict)

     # Perform the mlogistic computation
     n_iterations = compute_mlogregr(**all_arguments)

     # Create the output table from the final state of the iterative driver.
     plpy.execute("""
         CREATE TABLE {output_table} AS
             SELECT
                 ({schema_madlib}.__mlogregr_format(
                     (result).coef, {num_features},
                     {num_categories}, {ref_category})
                 ).category AS category,
                 (result).ref_category as ref_category,
                 ({schema_madlib}.__mlogregr_format(
                     (result).coef, {num_features},
                     {num_categories}, {ref_category})
                 ).coef AS coef,
                 (result).log_likelihood as loglikelihood,
                 ({schema_madlib}.__mlogregr_format(
                     (result).std_err, {num_features},
                     {num_categories}, {ref_category})
                 ).coef AS std_err,
                 ({schema_madlib}.__mlogregr_format(
                     (result).z_stats, {num_features},
                     {num_categories}, {ref_category})
                 ).coef AS z_stats,
                 ({schema_madlib}.__mlogregr_format(
                     (result).p_values, {num_features},
                     {num_categories}, {ref_category})
                 ).coef AS p_values,
                 ({schema_madlib}.__mlogregr_format(
                     (result).odds_ratios, {num_features},
                     {num_categories}, {ref_category})
                 ).coef AS odd_ratios,
                 (result).condition_no as condition_no,
                 {n_iterations} as num_iterations,
                 (result).num_processed as num_processed,
                 ({total_rows} - (result).num_processed) as num_missing_rows_skipped
             FROM (
                 SELECT
                     {schema_madlib}.__internal_mlogregr_irls_result(
                         _madlib_state) AS result
                 FROM _madlib_iterative_alg
                 WHERE _madlib_iteration = {n_iterations}
             ) subq
             """.format(n_iterations=n_iterations, **all_arguments))

     # Fetch the row counters; they belong in the summary table.
     result = plpy.execute("""SELECT num_processed, num_missing_rows_skipped
                              FROM {output_table}
                           """.format(output_table=output_table))[0]
     num_processed = result["num_processed"]
     num_missing_rows_skipped = result["num_missing_rows_skipped"]
     if not num_processed:
         # when no rows have been processed, a NULL result is returned.
         # We need to capture that to ensure correct value for num_processed
         num_processed = 0
         num_missing_rows_skipped = all_arguments['total_rows']

     # Remove the row counters from the output table since they are placed
     # in the summary table.
     plpy.execute("""ALTER TABLE {output_table}
                         DROP num_processed,
                         DROP num_missing_rows_skipped
                  """.format(output_table=output_table))

     # The names and expressions below are embedded in single-quoted SQL
     # string literals, so embedded single quotes must be doubled; otherwise
     # an expression such as ARRAY[1, (c = 'a')::int] breaks the statement.
     summary_arguments = dict(all_arguments)
     for key in ('source_table', 'output_table', 'dependent_varname',
                 'independent_varname', 'optimizer_params'):
         summary_arguments[key + '_literal'] = (
             "{0}".format(all_arguments[key]).replace("'", "''"))

     plpy.execute(
         """
         CREATE TABLE {output_table}_summary AS
             SELECT
               '{source_table_literal}'::VARCHAR            as source_table,
               '{output_table_literal}'::VARCHAR            as out_table,
               '{dependent_varname_literal}'::VARCHAR       as dependent_varname,
               '{independent_varname_literal}'::VARCHAR     as independent_varname,
               '{optimizer_params_literal}'::VARCHAR        as optimizer_params,
               {ref_category}::INTEGER              as ref_category,
               {num_processed}::INTEGER             as num_processed,
               {num_missing_rows_skipped}::INTEGER  as num_missing_rows_skipped
         """.format(num_processed=num_processed,
                    num_missing_rows_skipped=num_missing_rows_skipped,
                    **summary_arguments))

     plpy.execute("set client_min_messages to " + old_msg_level)
     return None
 # -------------------------------------------------------------------------


 def _validate_params(schema_madlib,
                      source_table, output_table, dependent_varname,
                      independent_varname, *args, **kwargs):
     """
     Validate the table and column arguments of the training function

     Checks, in this order, that the source table name is usable and the table
     exists, that the output table name is usable and neither the output table
     nor its "_summary" companion exists yet, that the source table is not
     empty, and that both column names are non-blank. Finally it verifies that
     no category appears exclusively in rows whose independent variables
     contain NULL values, since such a category would vanish during training.

     Args:
         @param schema_madlib: string, Name of the MADlib schema
         @param source_table: string, Name of input source table
         @param output_table: string, Name of output table
         @param dependent_varname: string, Name of the dependent column
         @param independent_varname: string, Name of the independent column
         @param args, kwargs: Additional arguments are accepted and ignored so
                that the caller can unpack a larger argument dictionary.

     Returns:
         None. Violations are reported through _assert.
     """
     _assert(source_table is not None and
             source_table.strip().lower() not in ('none', 'null', ''),
             "Mlogregr error: Invalid source table name")

     _assert(table_exists(source_table),
             "Mlogregr error: Source table {0} does not exist".
             format(source_table))

     _assert(output_table is not None and
             output_table.strip().lower() not in ('none', 'null', ''),
             "Mlogregr error: Invalid output table name")

     _assert(not table_exists(output_table),
             "Mlogregr error: Output table {0}"
             " already exists".format(str(output_table)))

     _assert(not table_exists(output_table + "_summary"),
             "Mlogregr error: Output table {0}_summary"
             " already exists".format(str(output_table)))

     _assert(not table_is_empty(source_table),
             "Mlogregr error: Source table {0} is empty".format(source_table))

     _assert(dependent_varname is not None and
             dependent_varname.strip().lower() not in ('null', ''),
             "Mlogregr error: Invalid dependent column name")

     # Strip before comparing, as for the dependent column, so that a
     # whitespace-only name is rejected here instead of failing later as a
     # SQL syntax error.
     _assert(independent_varname is not None and
             independent_varname.strip().lower() not in ('null', ''),
             "Mlogregr error: Invalid independent column name")

     # Categories present in the data, regardless of NULL features ...
     result_w_null = plpy.execute("""
         SELECT DISTINCT {dep} AS cat
         FROM {source}
         WHERE {dep} is not NULL
         """.format(source=source_table,
                    dep=dependent_varname))
     # ... and categories that still remain once rows with NULL features are
     # filtered out.
     result_wo_null = plpy.execute("""
         SELECT DISTINCT {dep} AS cat
         FROM {source}
         WHERE {dep} is not NULL
             AND NOT {madlib}.array_contains_null({indep})
         """.format(madlib=schema_madlib, source=source_table,
                    indep=independent_varname,
                    dep=dependent_varname))

     categories_wo_null = set(i["cat"] for i in result_wo_null)
     categories_w_null = set(i["cat"] for i in result_w_null)

     _assert(categories_wo_null == categories_w_null,
             "Mlogregr error: All observations of category set {0} contain "
             "NULL values. These rows should be removed from the dataset "
             "before proceeding.".
             format(list(categories_w_null - categories_wo_null)))
 # -------------------------------------------------------------------------


 def _extract_params(schema_madlib, optimizer_params):
     """ Parse the optimizer parameter string into a dictionary

     @brief  optimizer_params is a string of the form
            'max_iter=..., optimizer=..., precision=...'; the order of the
            entries does not matter. Every parameter that is not given keeps
            its default value, and None or '' yields all defaults. Unknown
            parameter names, malformed entries and non-numeric values for the
            numeric parameters raise an error.
            Supported parameters:
                 max_iter: integer (alias 'max_num_iterations'. Default=20)
                 optimizer: string (stored as given, lower-cased.
                                    Default='irls')
                 precision: float (alias 'tolerance'. Default=0.0001)
     Returns:
         Dict. Maps 'max_iter', 'optimizer' and 'precision' to their values

     Throws:
         "Mlogregr error" - If the parameter is unsupported or the value is
         not valid.
     """
     # Accepted spelling -> key under which the value is stored.
     canonical_names = {"max_iter": "max_iter",
                        "max_num_iterations": "max_iter",
                        "optimizer": "optimizer",
                        "precision": "precision",
                        "tolerance": "precision"}
     settings = {'max_iter': 20, 'optimizer': "irls", 'precision': 0.0001}

     if optimizer_params:
         for entry in preprocess_optimizer_params(optimizer_params):
             pieces = entry.split("=")
             if len(pieces) != 2:
                 plpy.error("Mlogregr error: Optimizer parameter list has incorrect format")
             name = pieces[0].strip(" \"").lower()
             value = pieces[1].strip(" \"").lower()

             target = canonical_names.get(name)
             if target is None:
                 plpy.error(
                 """
                 Mlogregr error: {param_name} is not a valid parameter name.
                 Run:
                     SELECT {schema_madlib}.mlogregr_train('usage');
                 to see the allowed parameters.
                 """.format(param_name=name,
                            schema_madlib=schema_madlib))

             if target == "max_iter":
                 try:
                     settings["max_iter"] = int(value)
                 except ValueError:
                     plpy.error("Mlogregr error: max_iter must be an integer")
             elif target == "optimizer":
                 settings["optimizer"] = value
             elif target == "precision":
                 try:
                     settings["precision"] = float(value)
                 except ValueError:
                     plpy.error("Mlogregr error: precision must be a float value")

         if settings["max_iter"] <= 0:
             plpy.error("Mlogregr error: max_iter must be positive")

     return settings


 # -- Help Messages -----------------------------------------------------------


 def mlogregr_help_message(schema_madlib, message, **kwargs):
     """ Help message for Multinomial Logistic Regression

     @brief Selects one of the help texts below and substitutes the MADlib
            schema name into it:
              - empty / None message: short summary
              - 'usage', 'help' or '?': argument and output description
              - 'example' or 'examples': runnable SQL example
              - anything else: a one-line "No such option" hint
     Args:
         @param schema_madlib string, Name of the schema madlib
         @param message string, Help message indicator
         @param kwargs Additional keyword arguments are accepted and ignored

     Returns:
         String. Contain the help message string
     """
     if not message:
         help_string = """
 -----------------------------------------------------------------------
                             SUMMARY
 -----------------------------------------------------------------------
 Functionality: Multinomial logistic regression is a widely used
 regression analysis tool that models the outcomes of categorical dependent
 random variables. The model assumes that the conditional mean of the dependent
 categorical variables is the logistic function of an affine combination of
 independent variables. Multinomial logistic regression finds the vector of
 coefficients that maximizes the likelihood of the observations.

 For more details on function usage:
     SELECT {schema_madlib}.mlogregr_train('usage')

 For an example on using the function:
     SELECT {schema_madlib}.mlogregr_train('example')
         """
     elif message in ['usage', 'help', '?']:
         # The column lists below mirror the tables created by mlogregr_train.
         help_string = """
 -----------------------------------------------------------------------
                                 USAGE
 -----------------------------------------------------------------------
  SELECT {schema_madlib}.mlogregr_train(
     'source_table',        -- VARCHAR, Name of the source table containing training data
     'output_table',        -- VARCHAR, Name of the output table to hold the trained model
     'dependent_varname',   -- VARCHAR, Name of the column containing the category values
                            --           (the values must be integers from 0 to num_categories-1)
     'independent_varname', -- VARCHAR, Name of the column containing the independent variables.
                            --           (Can also be an ARRAY expression)
     ref_category,          -- INTEGER, The value of reference category
                            --           (the value must be between 0 and num_categories-1)
     'optimizer_params'     -- VARCHAR, a comma-separated string with optimizer parameters
                            --   Valid optimizer parameters are:
                            --     max_iter: INTEGER, Maximum number of iterations to run algorithm
                            --     optimizer: VARCHAR, Optimizer algorithm to use
                            --                 (can be either 'newton' or 'irls')
                            --     precision: FLOAT8, Convergence tolerance. The algorithm terminates
                            --                 when two consecutive iterations differ in
                            --                 log-likelihood by less than this value
                            --                 (a negative value disables this criterion)
  )

 -----------------------------------------------------------------------
                                 OUTPUT
 -----------------------------------------------------------------------
 The output table ('output_table' above) has the following columns
      category       -- INTEGER, Category the coefficients in this row belong to
      ref_category   -- INTEGER, The value of reference category used
      coef           -- DOUBLE PRECISION[], Coefficients of regression
      loglikelihood  -- DOUBLE PRECISION, Log-likelihood value
      std_err        -- DOUBLE PRECISION[], Standard errors
      z_stats        -- DOUBLE PRECISION[], z-stats of the standard errors
      p_values       -- DOUBLE PRECISION[], p-values of the standard errors
      odd_ratios     -- DOUBLE PRECISION[], Odds ratios
      condition_no   -- DOUBLE PRECISION, Condition number
      num_iterations -- INTEGER, Number of iterations performed by the optimizer

 The output summary table is named as <'output_table'>_summary has the following columns
     source_table             -- VARCHAR, Source table name
     out_table                -- VARCHAR, Output table name
     dependent_varname        -- VARCHAR, Dependent variable name
     independent_varname      -- VARCHAR, Independent variable name
     optimizer_params         -- VARCHAR, Optimizer parameters used
     ref_category             -- INTEGER, The value of reference category used
     num_processed            -- INTEGER, Number of rows processed during training
     num_missing_rows_skipped -- INTEGER, Number of rows skipped during training due
                                          to missing values
         """
     elif message in ['example', 'examples']:
         help_string = """
 -- Create sample data set
 DROP TABLE IF EXISTS test3;
 CREATE TABLE test3 (
     feat1 INTEGER,
     feat2 INTEGER,
     cat INTEGER
 );
 INSERT INTO test3(feat1, feat2, cat) VALUES
 (1,35,1),
 (2,33,0),
 (3,39,1),
 (1,37,1),
 (2,31,1),
 (3,36,0),
 (2,36,1),
 (2,31,1),
 (2,41,1),
 (2,37,1),
 (1,44,1),
 (3,33,2),
 (1,31,1),
 (2,44,1),
 (1,35,1),
 (1,44,0),
 (1,46,0),
 (2,46,1),
 (2,46,2),
 (3,49,1),
 (2,39,0),
 (2,44,1),
 (1,47,1),
 (1,44,1),
 (1,37,2),
 (3,38,2),
 (1,49,0),
 (2,44,0),
 (3,61,2),
 (1,65,2),
 (3,67,1),
 (3,65,2),
 (1,65,2),
 (2,67,2),
 (1,65,2),
 (1,62,2),
 (3,52,2),
 (3,63,2),
 (2,59,2),
 (3,65,2),
 (2,59,0),
 (3,67,2),
 (3,67,2),
 (3,60,2),
 (3,67,2),
 (3,62,2),
 (2,54,2),
 (3,65,2),
 (3,62,2),
 (2,59,2),
 (3,60,2),
 (3,63,2),
 (3,65,2),
 (2,63,1),
 (2,67,2),
 (2,65,2),
 (2,62,2),
 (NULL,67,2),
 (2,NULL,2),
 (NULL,NULL,2),
 (2,62,NULL);

 -- Run the multilogistic regression function.
 DROP TABLE IF EXISTS test3_output;
 DROP TABLE IF EXISTS test3_output_summary;
 SELECT {schema_madlib}.mlogregr_train('test3',
                              'test3_output',
                              'cat',
                              'ARRAY[1, feat1, feat2]',
                              0,
                              'max_iter=20, optimizer=irls, precision=0.0001'
                              );
         """
     else:
         help_string = "No such option. Use {schema_madlib}.mlogregr_train()"

     # Every help text refers to the schema through the {schema_madlib} field.
     return help_string.format(schema_madlib=schema_madlib)
 # ---------------------------------------------------------------------------