src/ports/postgres/modules/stats/cox_prop_hazards.py_in - madlib - Git at Google

 m4_changequote(`<!', `!>')
 # coding=utf-8
 """
 @file cox_prop_hazards.py_in

 @brief Cox prop Hazards: Driver functions

 @namespace coxprophazards

 Cox prop Hazards: Driver functions

 //"""

 import plpy
 import re

 from utilities.control import MinWarning
 from utilities.validate_args import table_exists
 from utilities.validate_args import table_is_empty
 from utilities.validate_args import get_cols
 from utilities.validate_args import is_var_valid
 from utilities.utilities import unique_string
 from utilities.utilities import preprocess_keyvalue_params
 from utilities.utilities import _assert
 from utilities.utilities import get_seg_number
 from utilities.utilities import add_postfix
 from utilities.utilities import py_list_to_sql_string
 from utilities.validate_args import columns_exist_in_table
 from utilities.utilities import __mad_version
 from utilities.control import IterationController2S
 from convex.utils_regularization import utils_ind_var_scales
 import random

 # ----------------------------------------------------------------------

 version_wrapper = __mad_version()
 madvec = version_wrapper.select_vecfunc()


 def coxph_help_message(schema_madlib, message, **kwargs):
     """ Help message for Cox Proportional Hazards

     @brief
     Args:
         @param schema_madlib string, Name of the schema madlib
         @param message string, Help message indicator

     Returns:
         String. Contain the help message string
     """
     if not message:
         help_string = """
 -----------------------------------------------------------------------
                             SUMMARY
 -----------------------------------------------------------------------
 Functionality: Cox proprtional hazards regression (Breslow method)
 Proportional-Hazard models enable the comparison of various survival models.
 These survival models are functions describing the probability of a one-item
 event (prototypically, this event is death) with respect to time.
 The interval of time before death occurs is the survival time.
 Let T be a random variable representing the survival time,
 with a cumulative probability function P(t). Informally, P(t) is
 the probability that death has happened before time t.

 For more details on function usage:
     SELECT {schema_madlib}.coxph_train('usage')
 """

     elif message in ['usage', 'help', '?']:
         help_string = """
 -----------------------------------------------------------------------
                                 USAGE
 -----------------------------------------------------------------------
  SELECT {schema_madlib}.coxph_train(
  'source_table',            -- Name of data table
  'output_table',               -- Name of result table (overwrites if exists)
  'dependent_variable',      -- Name of column for dependent variables
  'independent_variable',    -- Name of column for independent variables
                                   (can be any SQL expression Eg: '*')
  'right_censoring_status',  -- Name of the column containing censoring status
                                     0/false : If the observation is censored
                                     1/true : otherwise
                                 Can also be an SQL expression: 'dependent_variable < 10')
                                (Optional, DEFAULT = TRUE)
  'strata',                  -- The stratification column names. (Optional, DEFAULT = NULL)
  'optimizer_params'         -- The optimizer parameters as a comma-separated string
  );

 -----------------------------------------------------------------------
                                 OUTUPT
 -----------------------------------------------------------------------
 The output table ('output_table' above) has the following columns
      'coef'          DOUBLE PRECISION[], -- Coefficients of regression
      'loglikelihood' DOUBLE PRECISION,   -- Log-likelihood value
      'std_err'       DOUBLE PRECISION[], -- Standard errors
      'z_stats'       DOUBLE PRECISION[], -- z-stats of the standard errors
      'p_values'      DOUBLE PRECISION[], -- p-values of the standard errors
      'num_iterations'      INTEGER       -- Number of iterations performed by the optimizer

 The output summary table is named as <output_table>_summary has the following columns
     'source_table'            VARCHAR, Source table name
     'dep_var'                 VARCHAR, Dependent variable name
     'ind_var'                 VARCHAR, Independent variable name
     'right_censoring_status'  VARCHAR, Right censoring status
     'strata'                  VARCHAR, Stratification columns
     num_rows_processed        INTEGER, Number of rows processed during training
     num_missing_rows_skipped  INTEGER, Number of rows skipped during training
                                         due to missing values

         """
     else:
         help_string = "No such option. Use {schema_madlib}.coxph_train()"

     return help_string.format(schema_madlib=schema_madlib)
 # ------------------------------------------------------------


 def coxph(
         schema_madlib,
         source_table,
         output_table,
         dependent_varname,
         independent_varname,
         right_censoring_status,
         strata,
         optimizer_params,
         *args,
         **kwargs):
     """ Cox proportional hazards regression training function
     @brief Cox proportional hazards regression, with
     stratification support.

     Args:
         @param schema_madlib - MADlib schema name
         @param source_table - A string, the data table name
         @param output_table - A string, the result table name
         @param dependent_varname - A string, the survival time
         column name or a valid expression
         @param independent_varname - A string, the covariates in
         array formats. It is a valid expression.
         @param right_censoring_status - A string, a column name
         or a valid expression that has boolean values. Whether
         the row of data is censored. Default is 'TRUE'.
         @param strata - A string, column names seprated by
         commas. The columns used for stratification. Default is
         None.
         @param optimizer_params - A string, which contains
         key=value pairs separated by commas. Default values:
         max_iter=20, optimizer='newton', tolerance=1e-4.

     Returns:
         A table named by output_table, which contains the
         following columns:
         * coef - An array of double precision values, fitting
         coefs
         * std_err - An array of double precision values,
         standard erros of coef
         * z - An array of double precision values, z statistics
         * p - An array of double precision values, p value
     """
     old_msg_level = plpy.execute(
         """
             select setting from pg_settings
             where name='client_min_messages'
             """)[0]['setting']
     plpy.execute("set client_min_messages to warning")

     all_arguments = {
         'schema_madlib': schema_madlib,
         'source_table': source_table,
         'output_table': output_table,
         'dependent_varname': dependent_varname,
         'independent_varname': independent_varname,
         'right_censoring_status': right_censoring_status,
         'strata': strata
     }
     _validate_params(**all_arguments)
     indepColumn = __check_args(schema_madlib, source_table,
                                independent_varname, dependent_varname,
                                right_censoring_status)

     (max_iter, optimizer, tolerance, array_agg_size,
      sample_size) = _extract_params(schema_madlib, optimizer_params)

     # Number of features
     n_features = plpy.execute(
         """
             select array_upper({independent_varname}, 1)
             as n_features from {source_table} limit 1
             """.format(
             independent_varname=independent_varname,
             source_table=source_table))[0]['n_features']

     # Use the quick split to re-distribute the data
     # construct a list of other columns
     other_cols = [right_censoring_status]
     new_source_table, index, dep, indep, status, n_processed, n_skipped, real_distid, std_str = quick_split(
         schema_madlib, source_table, dependent_varname, n_features,
         independent_varname, right_censoring_status, strata,
         array_agg_size, sample_size, True)
     if n_processed > 0:
         compute_coxph(schema_madlib, new_source_table, output_table,
                       index, dep, indep, n_features, status, strata, optimizer,
                       max_iter, tolerance, real_distid, std_str)
         plpy.execute('drop table if exists ' + new_source_table)
     else:
         plpy.execute(
             """
                 create table {output_table} (
                     coef           DOUBLE PRECISION[],
                     loglikelihood  DOUBLE PRECISION,
                     std_err        DOUBLE PRECISION[],
                     z_stats        DOUBLE PRECISION[],
                     p_values       DOUBLE PRECISION[],
                     hessian        DOUBLE PRECISION[],
                     num_iterations INTEGER
                 );
                 """.format(output_table=output_table))
         plpy.execute(
             """
                 insert into {output_table} values
                     (NULL, NULL, NULL, NULL, NULL, NULL, 1);
                 """.format(output_table=output_table))

     # the summary table
     output_table_summary = add_postfix(output_table, "_summary")
     plpy.execute(
         """
             create table {output_table_summary} as
                 select
                     'coxph'::varchar  as method,
                     '{source_table}'::varchar as source_table,
                     '{output_table}'::varchar as out_table,
                     '{dependent_varname}'::varchar as dependent_varname,
                     '{independent_varname}'::varchar as independent_varname,
                     {right_censoring_status}::varchar as right_censoring_status,
                     {strata}::varchar as strata,
                     {n_processed}::integer as num_processed,
                     {n_skipped}::integer as num_missing_rows_skipped;
             """.format(
             source_table=source_table,
             output_table_summary=output_table_summary,
             output_table=output_table,
             dependent_varname=dependent_varname,
             independent_varname=independent_varname,
             right_censoring_status=("NULL::text" if right_censoring_status is None
                                     else "'" + right_censoring_status + "'"),
             strata=("NULL::text" if strata is None
                     else "'" + strata + "'"),
             n_processed=n_processed,
             n_skipped=n_skipped))

     plpy.execute("set client_min_messages to " + old_msg_level)

     return None
 # ------------------------------------------------------------


 def compute_coxph(schema_madlib, source_table, output_table, index, dep,
                   indep, n_features, status, strata, optimizer, max_iter,
                   precision, real_distid, std_str):
     """ Use the old sequential algorithm to solve coxph

     @brief Run ordered aggregate on the re-distributed data.
     Each row of the table contains many original rows of the
     original data table in inverse order.
     """
     if max_iter < 1:
         plpy.error("Number of iterations must be positive")

     if optimizer not in ['newton']:
         plpy.error("Unknown optimizer requested. Must be 'newton'")

     # FIXME starting from random values or 0 ?
     coef = [0] * n_features
     L = float('-inf')

     # $1 - previous coef
     # $2 - coef limit, when it an array of all 0 (during 1-th iteration),
     # we compute the limit, and return it to Python. And then in the following
     # iterations, we use the limit.
     # the data is already sorted desc
     if strata is None:
         sql = """
             select
                 (f).*
             from (
                 select {schema_madlib}.coxph_improved_step(
                     {indep}, {dep}, {status}, $1, $2
                     order by {index}
                 ) as f
                 from {source_table}
                 group by {real_distid}  -- to avoid gathering data before aggregating
             ) s
         """.format(**locals())
     else:
         sql = """
             select
                 (f).*
             from
             (
                 select
                     {schema_madlib}.coxph_improved_strata_step_outer(inner_state) as f
                 from (
                     select
                         {schema_madlib}.coxph_improved_strata_step_inner(
                             {indep}, {dep}, {status}, $1, $2
                             order by {index}
                         ) as inner_state
                     from {source_table}
                     group by {strata}
                 ) t1
             ) t2
         """.format(**locals())

     old_coef = coef
     n_iter = 0
     max_coef = [-1] * n_features  # limit of coefficients
     sql_plan = plpy.prepare(sql, ["double precision[]", "double precision[]"])
     while True:
         n_iter += 1
         # The result contains coef, L, d2L
         result = plpy.execute(sql_plan, [coef, max_coef])[0]

         prev_L, L = L, result['l']
         new_coef = result['coef']
         if n_iter == 1:
             max_coef = result['max_coef']

         if L < prev_L:
             # Newton Raphson step
             coef = [(i + j)/2 for i, j in zip(coef, old_coef)]
         else:
             old_coef, coef = coef, new_coef

         if n_iter > max_iter or (L > prev_L and abs(1 - L / prev_L) <= precision):
             # exiting since max_iter iterations explored or
             #               very small increase in log_likelihood
             # Compute std_err, t_stats, p_value
             plpy.execute("""
                 CREATE TABLE {output_table} as
                     SELECT (f).*
                     FROM (
                         SELECT {schema_madlib}.compute_coxph_result(
                             {coef},
                             {L},
                             {d2L}, {n_iter}, {std_str}
                         ) as f
                     ) s;
                 """.format(output_table=output_table,
                            schema_madlib=schema_madlib,
                            coef=py_list_to_sql_string(coef),
                            L=L,
                            d2L=py_list_to_sql_string(result["d2l"]),
                            std_str=std_str,
                            n_iter=n_iter))
             return None
 # ------------------------------------------------------------


 # The new quick method. Used by PLANET algorithm for
 # decision tree. Since it has been successfully applied
 # onto an algorithm, no reason that we could not use it.
 def quick_split(
         schema_madlib,
         source_table,
         split_col,
         num_features,
         indep,
         status,
         strata,
         array_agg_size,
         sample_size,
         reverse=True,
         split_col_alias=''):
     """ Quickly find the splits of the split_col
     so that we can evenly cut the data without sorting

     @brief We want to cut the table to n even pieces, and at
     the same time keep split_col sorted. For example, in ARIMA,
     we need to partition the data into multiple chunks while
     each chunk contains consecutive time series. In CoxPH, we
     need to do the same thing. This step proves to be
     time-consuming becuase sorting (row_number over (order by
     ...)) is really slow.

     The solution is to work on a random sample of the original
     data instead of the whole data set.

     @param source_table The data table
     @param split_col    Find the cuts of this column
     @param num_features The number of features
     @param indep        The indepent variables
     """
     # Need the total number of records
     n_rows = plpy.execute(
         "select count(*) from {source_table}".format(
             source_table=source_table))[0]['count']

     # It might be cubersome to deal with NULL values in C++,
     # since a matrix of independent variables is passed into C++.
     filter_null = """
         {split_col} is not NULL and
         {schema_madlib}.array_contains_null({indep}) is False and
         {indep} is not NULL and
         ({status}) is not NULL
     """.format(schema_madlib=schema_madlib,
                split_col=split_col,
                indep=indep, status=status)

     # Number of rows to be processed
     n_processed = plpy.execute(
         "select count(*) from {source_table} where {filter_null}".format(
             source_table=source_table, filter_null=filter_null))[0]['count']
     n_skipped = n_rows - n_processed

     if n_processed == 0:
         return ('', 1, '', '',
                 '', n_processed, n_skipped, '', '')

     # Rewrite the num_splits
     n_rows_in_chunk = array_agg_size / num_features
     num_splits = (1 if n_rows < n_rows_in_chunk else n_rows / n_rows_in_chunk)

     # So that we could compute the percentage of the sample
     # We sample a few more values to make sure we can get enough
     # samples, otherwise the number of samples might be smaller
     # than sample_size.
     percentage = sample_size / n_rows + 0.01
     n_per_seg = int(sample_size / get_seg_number()) + 1

     output_table = unique_string() + '_redist'

     # nomalize the data to avoid possible overflow
     x_mean = plpy.execute(
         """
             select {schema_madlib}.array_avg({indep}, false) as xmean from {source_table}
             where {filter_null}
             """.format(
             schema_madlib=schema_madlib,
             indep=indep,
             source_table=source_table,
             filter_null=filter_null))[0]['xmean']
     mean_str = "array[" + ",".join(str(v) for v in x_mean) + "]"
     scales = plpy.execute(
         """
             select {schema_madlib}.array_avg(
                 {schema_madlib}.array_sub({indep}::float8[], {mean}::float8[]), true) as scale
             from {source_table} where {filter_null}
             """.format(
             schema_madlib=schema_madlib,
             indep=indep,
             mean=mean_str,
             source_table=source_table,
             filter_null=filter_null))[0]['scale']
     scale_str = "array[" + ",".join(str(v) if v != 0 else "1" for v in scales) + "]"

     # compute the break points
     # Use a fixed number of rows of sample to approximate the
     # break points.
     # Each bin will have approximate the same number of records.
     splits = plpy.execute(
         """
             select
                 {schema_madlib}._compute_splits(
                     {split_col}, {n_per_seg}, {num_splits})
                     as splits
             from {source_table}
             where random() <= {percentage} and {filter_null}
             """.format(
             schema_madlib=schema_madlib,
             split_col=split_col,
             n_per_seg=n_per_seg,
             num_splits=num_splits,
             source_table=source_table,
             filter_null=filter_null,
             percentage=percentage))[0]['splits']

     # Since the array of break points is not big
     # we load it into memory.
     splits_str = ('NULL::DOUBLE PRECISION[]' if splits is None
                   else "array[" + ",".join(str(split) for split in splits) + "]")

     # Use the same convention as _redistribute_data
     # ie. hard code dep, and indep as column names
     # NOTE: CoxPH will process the data in a reversed order. Thus
     # the latest data points will be grouped into the group 0.
     indep_name = unique_string() + '_indep'  # avoid name conflicts
     distid = unique_string() + '_distid'
     if split_col_alias == "":
         split_col_alias = unique_string() + '_split_alias'
     status_name = unique_string() + '_status'
     order_str = 'desc' if reverse else ''

     # Each big rows may contain different number of original small rows
     # since the quick split is approximate. But this is perfectly fine.
     if strata is None:
         real_distid = unique_string() + '_real_distid'
         strata_sql = """
             CREATE TEMP TABLE {output_table} AS
             select
                 0 as {real_distid},   -- to ensure that all chunks are stored in the same segment
                 {distid},
                 array_agg({split_col_alias} ORDER BY {split_col_alias} {order_str},
                         {status_name}) as {split_col_alias},
                 array_agg({status_name} ORDER BY {split_col_alias} {order_str},
                         {status_name}) as {status_name},
                 {schema_madlib}.matrix_agg(
                         {schema_madlib}.utils_normalize_data(
                             {indep_name}, {mean_str}, {scale_str}
                         )
                     ORDER BY {split_col_alias} {order_str}, {status_name}) as {indep_name}
             from
             (
                 select
                     {schema_madlib}._compute_grpid(
                         {splits}, {split_col}, {reverse})
                     as {distid},
                     {split_col} as {split_col_alias},
                     {indep} as {indep_name},
                     ({status})::INTEGER as {status_name}
                 from
                     {source_table}
                 where {filter_null}
             ) table_split
             group by {distid}
             m4_ifdef(<!__POSTGRESQL__!>, <!!>, <!DISTRIBUTED BY ({real_distid})!>);
             """.format(
             output_table=output_table,
             schema_madlib=schema_madlib,
             distid=distid,
             splits=splits_str,
             reverse='true' if reverse else 'false',
             split_col_alias=split_col_alias,
             split_col=split_col,
             indep=indep,
             indep_name=indep_name,
             status_name=status_name,
             status=status,
             filter_null=filter_null,
             source_table=source_table,
             order_str=order_str,
             mean_str=mean_str,
             scale_str=scale_str,
             real_distid=real_distid)
         plpy.execute(strata_sql)
         return (output_table, distid, split_col_alias, indep_name, status_name,
                 n_processed, n_skipped, real_distid, scale_str)
     else:
         plpy.execute("""
             create temp table {output_table} as
             select
                 {strata},
                 {distid},
                 array_agg({split_col_alias} order by {split_col_alias} {order_str},
                         {status_name}) as {split_col_alias},
                 array_agg({status_name} order by {split_col_alias} {order_str},
                         {status_name}) as {status_name},
                 {schema_madlib}.matrix_agg(
                         {schema_madlib}.utils_normalize_data(
                             {indep_name}, {mean_str}, {scale_str}
                         )
                     order by {split_col_alias} {order_str}, {status_name}) as {indep_name}
             from
             (
                 select
                     {strata},
                     {schema_madlib}._compute_grpid(
                         {splits}, {split_col}, {reverse})
                     as {distid},
                     {split_col} as {split_col_alias},
                     {indep} as {indep_name},
                     ({status})::INTEGER as {status_name}
                 from
                     {source_table}
                 where {filter_null}
             ) table_split
             group by {distid}, {strata}
             -- ensure that all chunks belonging to the same strata go to the same segment
             m4_ifdef(<!__POSTGRESQL__!>, <!!>, <!DISTRIBUTED BY ({strata})!>)
             """.format(
             output_table=output_table,
             schema_madlib=schema_madlib,
             distid=distid,
             splits=splits_str,
             reverse='true' if reverse else 'false',
             split_col_alias=split_col_alias,
             split_col=split_col,
             indep=indep,
             indep_name=indep_name,
             status_name=status_name,
             status=status,
             strata=strata,
             filter_null=filter_null,
             source_table=source_table,
             mean_str=mean_str,
             scale_str=scale_str,
             order_str=order_str))
         return (output_table, distid, split_col_alias, indep_name, status_name,
                 n_processed, n_skipped, '', scale_str)
 # ----------------------------------------------------------------------


 def _validate_params(schema_madlib, source_table, output_table,
                      dependent_varname, independent_varname,
                      right_censoring_status, strata, *args, **kwargs):
     """ Validate the input parameters for coxph

     Args:
         @param schema_madlib - MADlib schema name
         @param source_table - A string, the data table name
         @param output_table - A string, the result table name
         @param dependent_varname - A string, the survival time column
             name or a valid expression
         @param independent_varname - A string, the covariates in array
             formats. It is a valid expression.
         @param right_censoring_status - A string, a column name or a
             valid expression that has boolean values. Whether the row
             of data is censored. Default is 'TRUE'.
         @param strata - A string, column name seprated by commas. The
             columns used for stratification. Default is None.

     Throws:
         "Cox error" if any argument is invalid
     """
     # _assert(source_table is not None and table_exists(source_table),
     #         "Cox error: Source data table does not exist!")

     _assert(not table_exists(output_table, only_first_schema=True),
             "Cox error: Output table {0}"
             " already exists!".format(str(output_table)))

     output_table_summary = add_postfix(output_table, "_summary")
     _assert(not table_exists(output_table_summary, only_first_schema=True),
             "Cox error: Output table {0}"
             " already exists!".format(str(output_table_summary)))

     if strata is not None:
         strata_cols = [a.strip() for a in strata.split(",")]
         _assert(columns_exist_in_table(source_table, strata_cols,
                                        schema_madlib),
                 "ARIMA error: {1} columns do not exist in {0}!"
                 .format(source_table, strata_cols))
     return None
 # ----------------------------------------------------------------------


 def _extract_params(schema_madlib, optimizer_params):
     """ Extract optimizer control parameter or set the default values

     @brief  optimizer_params is a string with the format of
            'max_iter=..., optimizer=..., tolerance=...'. The order
            does not matter. If a parameter is missing, then the default
            value for it is used. If optimizer_params is None or '',
            then all default values are used. If the parameter specified
            is none of 'max_iter', 'optimizer', or 'tolerance' then an
            error is raised. This function also validates the values of
            these parameters.

     Throws:
         "Cox error" - If the parameter is unsupported or the value is
         not valid.
     """
     allowed_params = set(["max_iter", "optimizer", "tolerance",
                           "array_agg_size", "sample_size"])
     name_value = dict(max_iter=100, optimizer="newton", tolerance=1e-8,
                       array_agg_size=10000000, sample_size=1000000)

     if optimizer_params is None or len(optimizer_params) == 0:
         return (name_value['max_iter'], name_value['optimizer'],
                 name_value['tolerance'], name_value['array_agg_size'],
                 name_value['sample_size'])

     for s in preprocess_keyvalue_params(optimizer_params):
         items = s.split("=")
         if (len(items) != 2):
             plpy.error("Cox error: Optimizer parameter list has incorrect format!")
         param_name = items[0].strip(" \"").lower()
         param_value = items[1].strip(" \"").lower()

         if param_name not in allowed_params:
             plpy.error(
                 """
                 Cox error: {param_name} is not a valid parameter name.
                 Run:
                     SELECT {schema_madlib}.coxph('usage');
                 to see the allowed parameters.
                 """.format(param_name=param_name,
                            schema_madlib=schema_madlib))

         if param_name == "array_agg_size":
             try:
                 name_value["array_agg_size"] = int(param_value)
             except:
                 plpy.error("Cox error: array_agg_size must be an integer value!")

         if param_name == "sample_size":
             try:
                 name_value["sample_size"] = int(param_value)
             except:
                 plpy.error("Cox error: sample_size must be an integer value!")

         if param_name == "max_iter":
             try:
                 name_value["max_iter"] = int(param_value)
             except:
                 plpy.error("Cox error: max_iter must be an integer number!")

         if param_name == "optimizer":
             name_value["optimizer"] = param_value

         if param_name == "tolerance":
             try:
                 name_value["tolerance"] = float(param_value)
             except:
                 plpy.error("Cox error: tolerance must be a double precision value!")

     if name_value["max_iter"] <= 0:
         plpy.error("Cox error: max_iter must be positive!")

     if name_value["optimizer"] != "newton":
         plpy.error("Cox error: this optimization method is not supported yet!")

     if name_value["tolerance"] < 0:
         plpy.error("Cox error: tolerance cannot be smaller than 0!")

     if name_value["array_agg_size"] <= 0:
         plpy.error("Cox error: array_agg_size must be positive!")

     if name_value["sample_size"] <= 0:
         plpy.error("Cox error: sample_size must be positive!")

     return (name_value['max_iter'], name_value['optimizer'],
             name_value['tolerance'], name_value['array_agg_size'],
             name_value['sample_size'])

 # ----------------------------------------------------------------------


 def __check_args(schema_madlib, tbl_source, col_ind_var, col_dep_var, col_status):
     _assert(tbl_source is not None,
             "Cox Proportional Hazards Error: Source table should not be NULL!")
     _assert(col_ind_var is not None,
             "Cox Proportional Hazards Error: Independent variable should not be NULL!")
     _assert(col_dep_var is not None,
             "Cox Proportional Hazards Error: Dependent variable should not be NULL!")

     _assert(table_exists(tbl_source),
             "Cox Proportional Hazards Error: Source table " + tbl_source + " does not exist!")
     _assert(not table_is_empty(tbl_source),
             "Cox Proportional Hazards Error: Source table " + tbl_source + " is empty!")
     _assert(columns_exist_in_table(tbl_source, [col_dep_var]),
             "Cox Proportional Hazards Error: Dependent variable does not exist!")

     _assert(is_var_valid(tbl_source, col_ind_var),
             "Cox Proportional Hazards Error: The independent variable does not exist!")
     _assert(is_var_valid(tbl_source, col_status),
             "Cox Proportional Hazards Error: Not a valid boolean expression for status!")

     col_ind_var_new = col_ind_var
     cols = get_cols(tbl_source)

     # Select al columns except status and dependent variable
     if col_ind_var == "*":
         cols = get_cols(tbl_source)
         outstr_array = []
         for each_col in cols:
             if each_col != col_dep_var.lower() and each_col not in col_status.lower():
                 outstr_array.append(each_col)
         col_ind_var_new = 'array[%s]' % (','.join(outstr_array))
     return col_ind_var_new

 # -----------------------------------------------------------------------
 # ZPH functionality
 # -----------------------------------------------------------------------


 def zph_help_message(schema_madlib, message, **kwargs):
     """ Help message for function to test the proportional hazards assumption
     for a Cox regression model fit

     @brief
     Args:
         @param schema_madlib string, Name of the schema madlib
         @param message string, Help message indicator

     Returns:
         String. Contain the help message string
     """
     if not message:
         help_string = """
         -----------------------------------------------------------------------
                                     SUMMARY
         -----------------------------------------------------------------------
         Functionality: Test of proportional hazards assumption
         Proportional-Hazard models enable the comparison of various survival models.
         See {schema_madlib}.coxph_train() for details to create a Cox PH model.

         These PH models, however, assume that the hazard for a given individual
         is a fixed proportion of the hazard for any other individual, and the
         ratio of the hazards is constant across time.

         The cox_zph() function is used to test this assumption by computing the
         correlation of the residual of the Cox PH model with time.

         For more details on function usage:
             SELECT {schema_madlib}.cox_zph('usage')

         For an example on using the function:
             SELECT {schema_madlib}.cox_zph('example')
         """
     elif message in ['usage', 'help', '?']:
         help_string = """
         -----------------------------------------------------------------------
                                     USAGE
         -----------------------------------------------------------------------
          SELECT {schema_madlib}.cox_zph(
                 'cox_model_table',    -- TEXT. The name of the table containing the Cox Proportional-Hazards model
                 'output_table',       -- TEXT. The name of the table where the test statistics are saved
          );

         -----------------------------------------------------------------------
                                    OUTUPT
         -----------------------------------------------------------------------
         The <output table> ('output_table' above) has the following columns
                 - covariate     TEXT. The names of independent variables
                 - rho           FLOAT8[]. Vector of the correlation coefficients between
                                       survival time and the scaled Schoenfeld residuals
                 - chi_square    FLOAT8[]. Chi-square test statistic for the correlation analysis
                 - p_value       FLOAT8[]. Two-side p-value for the chi-square statistic

         The output residual table is named as <output_table>_residual has the following columns
                 - <dep_column_name> FLOAT8. Time values (dependent variable) present in the original source table.
                 - residual          FLOAT8[]. Difference between the original covariate value and the
                                           expectation of the covariate obtained from the coxph model.
                 - scaled_reisdual   FLOAT8[]. Residual values scaled by the variance of the coefficients
         """
     elif message in ['example', 'examples']:
         help_string = """
         DROP TABLE IF EXISTS sample_data;
         CREATE TABLE sample_data (
             id INTEGER NOT NULL,
             grp DOUBLE PRECISION,
             wbc DOUBLE PRECISION,
             timedeath INTEGER,
             status BOOLEAN
         );

         -- Insert sample data
         COPY sample_data FROM STDIN DELIMITER '|';
           0 |   0 | 1.45 |        35 | t
           1 |   0 | 1.47 |        34 | t
           3 |   0 |  2.2 |        32 | t
           4 |   0 | 1.78 |        25 | t
           5 |   0 | 2.57 |        23 | t
           6 |   0 | 2.32 |        22 | t
           7 |   0 | 2.01 |        20 | t
           8 |   0 | 2.05 |        19 | t
           9 |   0 | 2.16 |        17 | t
          10 |   0 |  3.6 |        16 | t
          11 |   1 |  2.3 |        15 | t
          12 |   0 | 2.88 |        13 | t
          13 |   1 |  1.5 |        12 | t
          14 |   0 |  2.6 |        11 | t
          15 |   0 |  2.7 |        10 | t
          16 |   0 |  2.8 |         9 | t
          17 |   1 | 2.32 |         8 | t
          18 |   0 | 4.43 |         7 | t
          19 |   0 | 2.31 |         6 | t
          20 |   1 | 3.49 |         5 | t
          21 |   1 | 2.42 |         4 | t
          22 |   1 | 4.01 |         3 | t
          23 |   1 | 4.91 |         2 | t
          24 |   1 |    5 |         1 | t
         \.

         -- Run coxph function
         SELECT {schema_madlib}.coxph_train(
             'sample_data',
             'sample_cox',
             'timedeath',
             'ARRAY[grp,wbc]',
             'status');

         -- Get the Cox PH model
         SELECT * FROM sample_cox;

         -- Run the PH assumption test and obtain the results
         SELECT {schema_madlib}.cox_zph('sample_cox', 'sample_zph_output');
         SELECT * FROM sample_zph_output;

         """
     else:
         help_string = "No such option. Use {schema_madlib}.cox_zph()"

     return help_string.format(schema_madlib=schema_madlib)


 def zph(schema_madlib, cox_output_table, output_table):
     """ Compute the Schoenfeld residuals for a Hazards data table

     @brief Compute the Schoenfeld residuals for a Hazards data table
     by using an aggregate-defined window function

     Args:
         @param schema_madlib: string, Name of the MADlib schema
         @param cox_output_table: string, Name of the coxph output_table
     Returns:
         None
     """
     _validate_zph_params(schema_madlib, cox_output_table, output_table)
     cox_output_table_summary = add_postfix(cox_output_table, "_summary")
     rv = plpy.execute("""
         SELECT
             source_table,
             dependent_varname,
             independent_varname,
             right_censoring_status,
             strata
         FROM {cox_output_table_summary}
         """.format(cox_output_table_summary=cox_output_table_summary))
     source_table = rv[0]['source_table']
     dependent_variable = rv[0]['dependent_varname']
     independent_variable = rv[0]['independent_varname']
     right_censoring_status = rv[0]['right_censoring_status']
     strata = rv[0]['strata']
     _compute_residual(schema_madlib, source_table, output_table,
                       dependent_variable, independent_variable,
                       cox_output_table, right_censoring_status,
                       strata)
 # ----------------------------------------------------------------------


 def _validate_zph_params(schema_madlib, cox_model_table, output_table):
     """
     Args:
         @param schema_madlib: string, Name of the MADlib schema
         @param cox_model_table: string, Table name for Cox Prop Hazards model
         @param output_table: string, Output data table name
     Returns:
         None
     Throws:
         Error on any invalid parameter
     """
     if cox_model_table is None or cox_model_table.strip() == '':
         plpy.error("Cox error: NULL/Empty model table is given!")

     cox_model_table_summary = add_postfix(cox_model_table, "_summary")
     _assert(cox_model_table is not None and table_exists(cox_model_table)
             and table_exists(cox_model_table_summary),
             "Cox error: Model table {0} or summary table {1} "
             "does not exist!".format(cox_model_table, cox_model_table_summary))

     output_table_residual= add_postfix(output_table, "_residual")
     _assert((not table_exists(output_table, only_first_schema=True)) and
             (not table_exists(output_table_residual, only_first_schema=True)),
             "Cox error: Output table {0} or residual table {1} "
             "already exists!".format(output_table, output_table_residual))

     summary_columns = ["source_table", "dependent_varname",
                        "independent_varname", "right_censoring_status",
                        "strata"]
     _assert(columns_exist_in_table(cox_model_table + "_summary", summary_columns),
             "Cox error: At least one column from {0} missing in "
             "model table {1}". format(str(summary_columns), cox_model_table))
     return None
 # ----------------------------------------------------------------------


 def _compute_residual(schema_madlib, source_table, output_table,
                       dependent_variable, independent_variable,
                       cox_output_table,
                       right_censoring_status=None,
                       strata=None, **kwargs):
     """ Compute the Schoenfeld residuals for a Hazards model

     @brief Computes the Schoenfeld residuals for a Hazards data table
     by using an aggregate-defined window function and outputs to a table

     Args:
         @param schema_madlib: string, Name of the MADlib schema
         @param source_table: string, Input data table name
         @param output_table: string, Output data table name
         @param dependent_variable: string, Dependent variable name
         @param independent_variable: string, Independent variable name (could also be an expression)
         @param right_censoring_status: string, Column name with right censoring status
         @param cox_output_table: string, Output table of coxph
         @param strata: string, Comma-separated list of columns to stratify with
     Returns:
         None
     """
     if not right_censoring_status:
         right_censoring_status = 'TRUE'
     if strata:
         partition_str = "PARTITION BY {0}".format(strata)
     else:
         partition_str = ''

     coef = madvec(plpy.execute("SELECT coef FROM {table} ".
                                format(table=cox_output_table))[0]["coef"],
                   text=False)
     coef_str = "ARRAY" + str(coef)
     # We don't extract a copy of the Hessian 2D array, since Postgres/GPDB still
     # don't support getting a 2d array into plpython
     residual_table = unique_string()
     format_args = {'schema_madlib': schema_madlib,
                    'output': output_table,
                    'indep_column': independent_variable,
                    'dep_column': dependent_variable,
                    'status': right_censoring_status,
                    'cox_output_table': cox_output_table,
                    'source_table': source_table,
                    'residual_table': residual_table,
                    'coef_str': coef_str,
                    'partition_str': partition_str}

     # plpy.info("--------- Computing residuals --------- ")
     plpy.execute("""
         CREATE TEMP TABLE {residual_table} AS
         SELECT
             {dep_column},
             {schema_madlib}.array_sub(
                     x::DOUBLE PRECISION[],
                     expectation_x::DOUBLE PRECISION[]
             ) AS residual
         FROM
         (
             SELECT
                 {dep_column},
                 ({indep_column})::DOUBLE PRECISION[] AS x,
                 ({status})::BOOLEAN as status,
                 {schema_madlib}.zph_agg(
                     ({indep_column})::DOUBLE PRECISION[],
                     {coef_str}
                 ) OVER ({partition_str} ORDER BY {dep_column} DESC) AS expectation_x
             FROM {source_table}
             WHERE {dep_column} IS NOT NULL AND
                   NOT {schema_madlib}.array_contains_null(
                                             {indep_column}::DOUBLE PRECISION[])
         ) AS q1
         WHERE status is TRUE
         ORDER BY {dep_column} ASC
         m4_ifdef(<!__POSTGRESQL__!>, <!!>, <!DISTRIBUTED BY ({dep_column})!>)
         """.format(**format_args))

     n_uncensored = plpy.execute("""SELECT count(*)::INTEGER as n_uncensored
                                    FROM {table}
                                 """.format(table=residual_table))[0]["n_uncensored"]
     format_args['n_uncensored'] = n_uncensored

     # plpy.info("--------- Computing scaled residuals ---------")
     output_residual = add_postfix(output_table, "_residual")
     plpy.execute("""
         CREATE TABLE {output_residual} AS
         SELECT
             {dep_column},
             residual as residual,
             {schema_madlib}.__coxph_scale_resid(
                 {n_uncensored}::INTEGER,
                 (SELECT hessian FROM {cox_output_table}),
                 residual
             ) AS scaled_residual
         FROM
             {residual_table}
         m4_ifdef(<!__POSTGRESQL__!>, <!!>, <!DISTRIBUTED BY ({dep_column})!>)
         """.format(output_residual=output_residual, **format_args))

     # plpy.info("--------- Computing metrics ---------")
     mean = plpy.execute("""
         SELECT avg({dep_column}) AS w FROM {residual_table}
         """.format(**format_args))[0]['w']
     plpy.execute("""
         CREATE TABLE {output} AS
         SELECT
             ('{indep_column}')::TEXT as covariate, rho,
             (f).chi_square_stat as chi_square, (f).p_value as p_value
         FROM (
             SELECT
                 {schema_madlib}.array_elem_corr_agg(
                                 scaled_residual,
                                 ({dep_column} - {mean})::DOUBLE PRECISION)
                     AS rho,
                 {schema_madlib}.__coxph_resid_stat_agg(
                                 ({dep_column} - {mean})::DOUBLE PRECISION,
                                 residual,
                                 (SELECT hessian FROM {cox_output_table}),
                                 {n_uncensored}::INTEGER)
                     AS f
             FROM
                 {output_residual}
         ) AS q1
         m4_ifdef(<!__POSTGRESQL__!>, <!!>, <!DISTRIBUTED RANDOMLY!>)
         """.format(mean=mean, output_residual=output_residual, **format_args))
     # Cleanup
     plpy.execute('DROP TABLE IF EXISTS ' + residual_table)


 def cox_prop_hazards(schema_madlib, usage_string, **kwargs):
     plpy.warning("This function has been deprecated. Please use 'coxph_train' instead.")
     if usage_string == '':
         message = """
 Summary
 ------------------------------------------------------------------------------------
     Functionality: Cox proprtional hazards regression (Breslow method)
     SELECT {schema_madlib}.cox_prop_hazards(
         'source_table',
         'output_table',
         'dependent_variable',
         'independent_variable',
         'right_censoring_status'
     );
 For more details on function usage:
     SELECT {schema_madlib}.cox_prop_hazards('usage');
         """
     elif usage_string in ('usage', 'help', '?'):
         message = """
 Usage
 ------------------------------------------------------------------------------------
     SELECT {schema_madlib}.cox_prop_hazards(
         'source_table',            -- Name of data table
         'output_table',            -- Name of result table (overwrites if exists)
         'dependent_variable',      -- Name of column for dependent variables
         'independent_variable',    -- Name of column for independent variables
                                      (can be any SQL expression Eg: ''*'')
         ['right_censoring_status', -- Name of the column containing censoring status
                                    -- 0/false : If the observation is censored
                                    -- 1/true : otherwise
                                    -- Default is 1/true for all observations
                                    -- Can also be an SQL expression: 'dependent_variable < 10'
     );

 Output:
 ------------------------------------------------------------------------------------
     The output table (''output_table'' above) has the following columns:
         'coef'      DOUBLE PRECISION[],  -- Coefficients of regression
         'std_err'   DOUBLE PRECISION[], -- Standard errors
         'z_stats'   DOUBLE PRECISION[], -- z-stats of the standard errors
         'p_values'  DOUBLE PRECISION[], -- p-values of the standard errors
 """
     else:
         message = "No such option. Run SELECT {schema_madlib}.cox_prop_hazards()"
     return message.format(schema_madlib=schema_madlib)


 def _validate_predict(schema_madlib, model_table, source_table, id_col_name,
                       output_table, pred_type, reference):
     """ Validate the arguments
     """
     _assert(pred_type in ('linear_predictors', 'risk', 'terms'),
             "Cox predict error: Invalid prediction type.")
     _assert(reference in ('overall', 'strata'),
             "Cox predict error: Invalid reference type.")
     _assert(model_table and model_table.strip().lower() not in ('null', ''),
             "Cox predict error: Invalid model table.")
     _assert(table_exists(model_table),
             "Cox predict error: Model table is missing.")
     _assert(source_table and source_table.strip().lower() not in ('null', ''),
             "Cox predict error: Invalid source table.")
     _assert(table_exists(source_table),
             "Cox predict error: source table is missing.")
     _assert(not table_exists(output_table, only_first_schema=True),
             "Cox predict error: output table already exists.")
     _assert(columns_exist_in_table(model_table, ["coef"], schema_madlib),
             "Cox predict error: Invalid model table ({0})".format(model_table))
     model_summary = model_table + "_summary"
     _assert(table_exists(model_summary),
             "Cox predict error: Model summary table is missing.")
     _assert(columns_exist_in_table(model_summary,
                                    ["independent_varname", "strata"],
                                    schema_madlib),
             "Cox predict error: Invalid summary table ({0})".format(model_summary))
 # ------------------------------------------------------------------------------


 def coxph_predict(schema_madlib, model_table, source_table, id_col_name, output_table,
                   pred_type, reference, **kwargs):
     """
         Cox prediction function
     """
     if pred_type is None:
         pred_type = "linear_predictors"
     if reference is None:
         reference = "strata"
     _validate_predict(schema_madlib, model_table, source_table,
                       id_col_name, output_table, pred_type, reference)
     model_table_summary = model_table + "_summary"
     summary_elements = plpy.execute("SELECT strata, independent_varname FROM {0}"
                                     .format(model_table_summary))[0]

     strata = summary_elements['strata']
     independent_varname = summary_elements['independent_varname']
     term_avg = unique_string()

     select_strata = ""
     group_by_strata = ""
     where_strata = ""

     if pred_type == "terms":
         strata = None
     if strata is not None and reference == "strata":
         select_strata = "{strata}".format(**locals()) + ","
         group_by_strata = "group by {strata}".format(**locals())
         strat_cols_condition = ""
         els = []
         for s in strata.split(","):
             els.append("t." + s + " = s." + s)
         strat_cols_condition = " and ".join(els)
         where_strata = "where {strat_cols_condition}".format(**locals())

     if columns_exist_in_table(source_table, [id_col_name], schema_madlib):
         coxph_predict_id = id_col_name
     else:
         coxph_predict_id = 'coxph_predict_id'

     # resolve name conflicts in output table
     output_name = pred_type + "_output" if id_col_name == pred_type else pred_type
     if pred_type != "terms":
         sql_predict = """
             CREATE TABLE {output_table} AS
             SELECT {id_col_name} AS {coxph_predict_id},
                    {schema_madlib}._coxph_predict_resp(
                         coef,
                         {independent_varname},
                         {term_avg},
                         '{pred_type}'::TEXT) AS {output_name}
             FROM (
                 SELECT
                     {select_strata}
                     {schema_madlib}.avg({independent_varname}) as {term_avg}
                 FROM {source_table}
                 {group_by_strata}
             ) t,
             {model_table} m,
             {source_table} s
             {where_strata}
         """.format(**locals())
     else:
         sql_predict = """
             CREATE TABLE {output_table} AS
                 SELECT
                     {id_col_name} AS {coxph_predict_id},
                     {schema_madlib}._coxph_predict_terms(
                         coef,
                         {independent_varname},
                         {term_avg}
                         ) AS {output_name}
                 FROM (
                     SELECT
                         {select_strata}
                         {schema_madlib}.avg({independent_varname}) as {term_avg}
                     FROM {source_table}
                     {group_by_strata}
                 ) t,
                 {model_table} m,
                 {source_table} s
                 {where_strata}
         """.format(**locals())

     with MinWarning('warning'):
         plpy.notice("sql_predict:\n" + sql_predict)
         plpy.execute(sql_predict)
     return None


 def coxph_predict_help_message(schema_madlib, message, **kwargs):
     """ Help message for prediction using a CoxPH model

     @brief
     Args:
         @param schema_madlib string, Name of the schema madlib
         @param message string, Help message indicator

     Returns:
         String. Help message string
     """
     if not message:
         help_string = """
         -----------------------------------------------------------------------
                                     SUMMARY
         -----------------------------------------------------------------------
         Functionality: Prediction using a CoxPH model

         The prediction function is provided to calculate the linear
         predictors, risk or the linear terms for the given prediction data.

         For more details on function usage:
             SELECT {schema_madlib}.coxph_predict('usage')

         For an example on using the function:
             SELECT {schema_madlib}.coxph_predict('example')
         """
     elif message in ['usage', 'help', '?']:
         help_string = """
         -----------------------------------------------------------------------
                                     USAGE
         -----------------------------------------------------------------------
         SELECT {schema_madlib}.coxph_predict(
             'model_table',      -- TEXT. Name of the table containing the cox model.
             'source_table',     -- TEXT. Name of the table containing the prediction data.
             'id_col_name',      -- TEXT. Name of the id column in the source table.
             'output_table',     -- TEXT. Name of the table to store the prediction results in.
             'pred_type',        -- TEXT. Type of prediction. Can be one of 'linear_predictors',
                                             'risk' or 'terms'. Default = 'linear_predictors'.
                                       'linear_predictors' calculates the dot product of the
                                           independent variables and the coefficients.
                                       'risk' is the exponentiated value of the linear prediction.
                                       'terms' correspond to the linear terms obtained by multiplying
                                               the independent variables with their corresponding coefficients
                                               values (without further calculating the sum of these terms)
                                       The resulting predictions, in all of the above cases, are then
                                       centered around a reference level.
             'reference'         -- TEXT. Reference level to use for centering the predictions.
                                           Can be one of 'strata' or 'overall'. Default = 'strata'.
                                     Cox model is a relative risk model wherein the predictions are
                                     relative to the sample that they are taken from. Therefore, all
                                     predictions are centered around the mean of the covariates within
                                     each stratum by default. If it is instead desired to use the
                                     mean over all samples, reference='overall' can be
                                     specified. If there was no stratification involved, the reference
                                     parameter does not have any effect.

                                     Note 1:  For pred_type = 'terms', the predictions are always
                                     centered around the overall mean values of the covariates
                                     independent of stratification.

                                     Note 2: R uses 'sample' instead of 'overall' when referring to
                                     the overall mean value of the covariates as being the reference
                                     level.
         )
         -----------------------------------------------------------------------
                                    OUTUPT
         -----------------------------------------------------------------------
         The <output table> ('output_table' above) has the following columns
                 - id                TEXT. The id column name from the source table
                 - predicted_result  DOUBLE PRECISION. Result of prediction based of the value of
                                         the pred_type parameter
         """
     elif message in ['example', 'examples']:
         help_string = """
         DROP TABLE IF EXISTS sample_data;
         CREATE TABLE sample_data (
             id INTEGER NOT NULL,
             grp DOUBLE PRECISION,
             wbc DOUBLE PRECISION,
             timedeath INTEGER,
             status BOOLEAN
         );

         -- Insert sample data
         COPY sample_data FROM STDIN DELIMITER '|';
           0 |   0 | 1.45 |        35 | t
           1 |   0 | 1.47 |        34 | t
           3 |   0 |  2.2 |        32 | t
           4 |   0 | 1.78 |        25 | t
           5 |   0 | 2.57 |        23 | t
           6 |   0 | 2.32 |        22 | t
           7 |   0 | 2.01 |        20 | t
           8 |   0 | 2.05 |        19 | t
           9 |   0 | 2.16 |        17 | t
          10 |   0 |  3.6 |        16 | t
          11 |   1 |  2.3 |        15 | t
          12 |   0 | 2.88 |        13 | t
          13 |   1 |  1.5 |        12 | t
          14 |   0 |  2.6 |        11 | t
          15 |   0 |  2.7 |        10 | t
          16 |   0 |  2.8 |         9 | t
          17 |   1 | 2.32 |         8 | t
          18 |   0 | 4.43 |         7 | t
          19 |   0 | 2.31 |         6 | t
          20 |   1 | 3.49 |         5 | t
          21 |   1 | 2.42 |         4 | t
          22 |   1 | 4.01 |         3 | t
          23 |   1 | 4.91 |         2 | t
          24 |   1 |    5 |         1 | t
         \.

         -- Run coxph function
         SELECT {schema_madlib}.coxph_train(
             'sample_data',
             'sample_cox',
             'timedeath',
             'ARRAY[grp,wbc]',
             'status');

         -- View the Cox PH model
         SELECT * FROM sample_cox;

         -- Predict back on the original dataset
         SELECT madlib.coxph_predict('sample_cox',
                                     'sample_data',
                                     'id',
                                     'sample_pred',
                                     'risk');
         """
     else:
         help_string = "No such option. Use {schema_madlib}.coxph_predict()"

     return help_string.format(schema_madlib=schema_madlib)