| /* ----------------------------------------------------------------------- *//** |
| * |
| * @file cox_prop_hazards.sql_in |
| * |
| * @brief SQL functions for cox proportional hazards |
| * @date July 2012 |
| * |
| * @sa For a brief introduction to cox regression, see the |
| * module description \ref grp_cox_prop_hazards |
| * |
| *//* ----------------------------------------------------------------------- */ |
| |
| m4_include(`SQLCommon.m4') |
| |
| /** |
| @addtogroup grp_cox_prop_hazards |
| |
| <div class="toc"><b>Contents</b> |
| <ul> |
| <li class="level1"><a href="#training">Training Function</a> |
| <li class="level1"><a href="#cox_zph">PHA Test Function</a> |
| <li class="level1"><a href="#examples">Examples</a></li> |
| <li class="level1"><a href="#background">Technical Background</a></li> |
| <li class="level1"><a href="#related">Related Topics</a></li> |
| </ul> |
| </div> |
| |
| @brief Models the relationship between one or more independent predictor |
| @variables and the amount of time before an event occurs. |
| |
| Proportional-Hazard models enable the comparison of various survival models. |
| These survival models are functions describing the probability of a one-item |
| event (prototypically, this event is death) with respect to time. |
| The interval of time before death occurs is the survival time. |
| Let T be a random variable representing the survival time, |
| with a cumulative probability function P(t). Informally, P(t) is |
| the probability that death has happened before time t. |
| |
| @anchor training |
| @par Training Function |
| |
| Following is the syntax for the coxph_train() training function: |
| <pre class="syntax"> |
| coxph_train( source_table, |
| output_table, |
| dependent_variable, |
| independent_variable, |
| right_censoring_status, |
| strata, |
| optimizer_params |
| ) |
| </pre> |
| \b Arguments |
| <dl class="arglist"> |
| <dt>source_table</dt> |
| <dd>TEXT. The name of the table containing input data.</dd> |
| <dt>output_table</dt> |
| <dd>TEXT. The name of the table where the output model is saved. |
| The output is saved in the table named by the <em>output_table</em> argument. It has the following columns: |
| <table class="output"> |
| <tr> |
| <th>coef</th> |
| <td>FLOAT8[]. Vector of the coefficients.</td> |
| </tr> |
| <tr> |
| <th>loglikelihood</th> |
| <td>FLOAT8. Log-likelihood value of the MLE estimate.</td> |
| </tr> |
| <tr> |
| <th>std_err</th> |
| <td>FLOAT8[]. Vector of the standard error of the coefficients.</td> |
| </tr> |
| <tr> |
| <th>stats</th> |
| <td>FLOAT8[]. Vector of the statistics of the coefficients.</td> |
| </tr> |
| <tr> |
| <th>p_values</th> |
| <td>FLOAT8[]. Vector of the p-values of the coefficients.</td> |
| </tr> |
| <tr> |
| <th>hessian</th> |
| <td>FLOAT8[]. The Hessian matrix computed using the final solution.</td> |
| </tr> |
| <tr> |
| <th>num_iterations</th> |
| <td>INTEGER. The number of iterations performed by the optimizer.</td> |
| </tr> |
| </table> |
| </dd> |
| <dd> Additionally, a summary output table is generated that contains |
| a summary of the parameters used for building the Cox model. It is stored |
| in a table named <output_table>_summary. It has the following columns: |
| <table class="output"> |
| <tr> |
| <th>source_table</th> |
| <td>The source table name.</td> |
| </tr> |
| <tr> |
| <th>dependent_variable</th> |
| <td>The dependent variable name.</td> |
| </tr> |
| <tr> |
| <th>independent_variable</th> |
| <td>The independent variable name.</td> |
| </tr> |
| <tr> |
| <th>right_censoring_status</th> |
| <td>The right censoring status</td> |
| </tr> |
| <tr> |
| <th>strata</th> |
| <td>The stratification columns</td> |
| </tr> |
| <tr> |
| <th>num_processed</th> |
| <td>The number of rows that were actually used in the computation.</td> |
| </tr> |
| <tr> |
| <th>num_missing_rows_skipped</th> |
| <td>The number of rows that were skipped in the computation due the NULL values in them.</td> |
| </tr> |
| </table> |
| </dd> |
| |
| <dt>dependent_variable</dt> |
| <dd>TEXT. A string containing the name of a column that contains |
| an array of numeric values, or a string expression in the format 'ARRAY[1, x1, x2, x3]', |
| where <em>x1</em>, <em>x2</em> and <em>x3</em> are column names. Dependent |
| variables refer to the time of death. There is no need to pre-sort the data.</dd> |
| <dt>independent_variable</dt> |
| <dd>TEXT. The name of the independent variable.</dd> |
| <dt>right_censoring_status (optional)</dt> |
| <dd>TEXT, default: TRUE for all observations. A string containing an expression that evaluates to the right-censoring status for the observation—TRUE if the observation is not |
| censored and FALSE if the observation is censored. The string could contain |
| the name of the column containing the right-censoring status, a fixed Boolean |
| expression (i.e., 'true', 'false', '0', '1') that applies to all observations, |
| or a Boolean expression such as 'column_name < 10' that can be evaluated for each |
| observation.</dd> |
| <dt>strata (optional)</dt> |
| <dd>VARCHAR, default: NULL, which does not do any stratifications. A string of comma-separated column names that are the strata ID variables used to do stratification.</dd> |
| <dt>optimizer_params (optional)</dt> |
| <dd>VARCHAR, default: NULL, which uses the default values of optimizer parameters: max_iter=20, optimizer=newton, tolerance=1e-4. It should be a string that contains pairs of 'key=value' separated by commas.</dd> |
| </dl> |
| |
| @anchor cox_zph |
| @par Proportional Hazards Assumption Test Function |
| |
| The cox_zph() function tests the proportional hazards assumption (PHA) of a Cox regression. |
| |
| Proportional-hazard models enable the comparison of various survival models. |
| These PH models, however, assume that the hazard for a given individual is a |
| fixed proportion of the hazard for any other individual, and the ratio of the |
| hazards is constant across time. MADlib does not currently have support for |
| performing any transformation of the time to compute the correlation. |
| |
| The <em>cox_zph()</em> function is used to test this assumption by computing the correlation |
| of the residual of the coxph_train model with time. |
| |
| Following is the syntax for the cox_zph() function: |
| <pre class="syntax"> |
| cox_zph(cox_model_table, output_table) |
| </pre> |
| \b Arguments |
| <dl class="arglist"> |
| <dt>cox_model_table</dt> |
| <dd>TEXT. The name of the table containing the Cox Proportional-Hazards model.</dd> |
| |
| <dt>output_table</dt> |
| <dd>TEXT. The name of the table where the test statistics are saved. |
| The output table is named by the <em>output_table</em> argument and has |
| the following columns: |
| <table class="output"> |
| <tr> |
| <th>covariate</th> |
| <td>TEXT. The independent variables.</td> |
| </tr> |
| <tr> |
| <th>rho</th> |
| <td>FLOAT8[]. Vector of the correlation coefficients between |
| survival time and the scaled Schoenfeld residuals.</td> |
| </tr> |
| <tr> |
| <th>chi_square</th> |
| <td> FLOAT8[]. Chi-square test statistic for the correlation analysis.</td> |
| </tr> |
| <tr> |
| <th>p_value</th> |
| <td>FLOAT8[]. Two-side p-value for the chi-square statistic.</td> |
| </tr> |
| </table> |
| </dd> |
| </dl> |
| |
| Additionally, the residual values are outputted to the table named <em>output_table</em>_residual. |
| The table contains the following columns: |
| <table class="output"> |
| <tr> |
| <th><dep_column_name></th> |
| <td>FLOAT8. Time values (dependent variable) present in the original source table. </td> |
| </tr> |
| <tr> |
| <th>residual</th> |
| <td>FLOAT8[]. Difference between the original covariate values and the |
| expectation of the covariates obtained from the coxph_train model.</td> |
| </tr> |
| <tr> |
| <th>scaled_reisdual</th> |
| <td>Residual values scaled by the variance of the coefficients.</td> |
| </tr> |
| </table> |
| |
| @anchor notes |
| @par Notes |
| |
| - Table names can be optionally schema qualified (current_schemas() is |
| used if a schema name is not provided) and table and column names |
| should follow case-sensitivity and quoting rules per the database. |
| For instance, 'mytable' and 'MyTable' both resolve to the same entity—'mytable'. |
| If mixed-case or multi-byte characters are desired for entity names then the |
| string should be double-quoted; in this case the input would be '"MyTable"'. |
| |
| - The cox_prop_hazards_regr() and cox_prop_hazards() functions have been |
| deprecated; coxph_train() should be used instead. |
| |
| @anchor examples |
| @examp |
| -# View online help for the proportional hazards training method. |
| <pre class="example"> |
| SELECT madlib.coxph_train(); |
| </pre> |
| |
| -# Create an input data set. |
| <pre class="example"> |
| DROP TABLE IF EXISTS sample_data; |
| CREATE TABLE sample_data ( |
| id INTEGER NOT NULL, |
| grp DOUBLE PRECISION, |
| wbc DOUBLE PRECISION, |
| timedeath INTEGER, |
| status BOOLEAN |
| ); |
| COPY sample_data FROM STDIN DELIMITED BY '|'; |
| 0 | 0 | 1.45 | 35 | t |
| 1 | 0 | 1.47 | 34 | t |
| 3 | 0 | 2.2 | 32 | t |
| 4 | 0 | 1.78 | 25 | t |
| 5 | 0 | 2.57 | 23 | t |
| 6 | 0 | 2.32 | 22 | t |
| 7 | 0 | 2.01 | 20 | t |
| 8 | 0 | 2.05 | 19 | t |
| 9 | 0 | 2.16 | 17 | t |
| 10 | 0 | 3.6 | 16 | t |
| 11 | 1 | 2.3 | 15 | t |
| 12 | 0 | 2.88 | 13 | t |
| 13 | 1 | 1.5 | 12 | t |
| 14 | 0 | 2.6 | 11 | t |
| 15 | 0 | 2.7 | 10 | t |
| 16 | 0 | 2.8 | 9 | t |
| 17 | 1 | 2.32 | 8 | t |
| 18 | 0 | 4.43 | 7 | t |
| 19 | 0 | 2.31 | 6 | t |
| 20 | 1 | 3.49 | 5 | t |
| 21 | 1 | 2.42 | 4 | t |
| 22 | 1 | 4.01 | 3 | t |
| 23 | 1 | 4.91 | 2 | t |
| 24 | 1 | 5 | 1 | t |
| \. |
| </pre> |
| -# Run the Cox regression function. |
| <pre class="example"> |
| SELECT madlib.coxph_train( 'sample_data', |
| 'sample_cox', |
| 'timedeath', |
| 'ARRAY[grp,wbc]', |
| 'status' |
| ); |
| </pre> |
| -# View the results of the regression. |
| <pre class="example"> |
| \\x on |
| SELECT * FROM sample_cox; |
| </pre> |
| Results: |
| <pre class="result"> |
| -[ RECORD 1 ]----------------------------------------- |
| coef | {2.54449137803027,1.67183255057665} |
| std_err | {0.677308807341768,0.387308633304678} |
| z_stats | {3.75676700265663,4.31653830257251} |
| p_values | {0.000172122613528057,1.58495189046891e-05} |
| </pre> |
| -# View online help for the function to test Proportional Hazards Assumption. |
| <pre class="example"> |
| SELECT madlib.cox_zph(); |
| </pre> |
| |
| -# Run the test for Proportional Hazards assumption to obtain correlation between |
| residuals and time. |
| <pre class="example"> |
| SELECT madlib.cox_zph( 'sample_cox', |
| 'sample_zph_output' |
| ); |
| </pre> |
| -# View results of the PHA test. |
| <pre class="example"> |
| SELECT * FROM sample_zph_output; |
| </pre> |
| Results: |
| <pre class="result"> |
| covariate | ARRAY[grp,wbc] |
| rho | {0.00237407653178531,0.0375603884069487} |
| chi_square | {0.000100760403707082,0.0232322905624675} |
| p_value | {0.991991010621734,0.878854560410758} |
| </pre> |
| |
| |
| @anchor background |
| @par Technical Background |
| |
| Generally, proportional-hazard models start with a list of \f$ \boldsymbol n \f$ observations, |
| each with \f$ \boldsymbol m \f$ covariates and a time of death. From this |
| \f$ \boldsymbol n \times m \f$ matrix, we would like to derive the correlation |
| between the covariates and the hazard function. This amounts to finding |
| the parameters \f$ \boldsymbol \beta \f$ that best fit the model described below. |
| |
| Let us define: |
| - \f$ \boldsymbol t \in \mathbf R^{m} \f$ denote the vector of observed dependent |
| variables, with \f$ n \f$ rows. |
| - \f$ X \in \mathbf R^{m} \f$ denote the design matrix with \f$ m \f$ |
| columns and \f$ n \f$ rows, containing all observed vectors of independent |
| variables \f$ \boldsymbol x_i \f$ as rows. |
| - \f$ R(t_i) \f$ denote the set of observations still alive at time \f$ t_i \f$ |
| |
| Note that this model <b>does not</b> include a <b>constant term</b>, and the data |
| cannot contain a column of 1s. |
| |
| By definition, |
| \f[ |
| P[T_k = t_i | \boldsymbol R(t_i)] |
| = \frac{e^{\beta^T x_k} }{ \sum_{j \in R(t_i)} e^{\beta^T x_j}}. |
| \,. |
| \f] |
| |
| The <b>partial likelihood </b>function can now be generated as the product of |
| conditional probabilities: |
| \f[ |
| \mathcal L = \prod_{i = 1}^n |
| \left( |
| \frac{e^{\beta^T x_i}}{ \sum_{j \in R(t_i)} e^{\beta^T x_j}} |
| \right). |
| \f] |
| |
| The log-likelihood form of this equation is |
| \f[ |
| L = \sum_{i = 1}^n |
| \left[ \beta^T x_i |
| - \log\left(\sum_{j \in R(t_i)} e^{\beta^T x_j }\right) |
| \right]. |
| \f] |
| |
| Using this score function and Hessian matrix, the partial likelihood can be |
| maximized using the <b> Newton-Raphson algorithm</b>. <b>Breslow's method</b> |
| is used to resolved tied times of deaths. The time of death for two records are |
| considered "equal" if they differ by less than 1.0e-6 |
| |
| The inverse of the Hessian matrix, evaluated at the estimate of |
| \f$ \boldsymbol \beta \f$, can be used as an <b>approximate variance-covariance |
| matrix </b> for the estimate, and used to produce approximate |
| <b>standard errors</b> for the regression coefficients. |
| |
| \f[ |
| \mathit{se}(c_i) = \left( (H)^{-1} \right)_{ii} |
| \,. |
| \f] |
| The Wald z-statistic is |
| \f[ |
| z_i = \frac{c_i}{\mathit{se}(c_i)} |
| \,. |
| \f] |
| |
| The Wald \f$ p \f$-value for coefficient \f$ i \f$ gives the probability (under |
| the assumptions inherent in the Wald test) of seeing a value at least as extreme |
| as the one observed, provided that the null hypothesis (\f$ c_i = 0 \f$) is |
| true. Letting \f$ F \f$ denote the cumulative density function of a standard |
| normal distribution, the Wald \f$ p \f$-value for coefficient \f$ i \f$ is |
| therefore |
| \f[ |
| p_i = \Pr(|Z| \geq |z_i|) = 2 \cdot (1 - F( |z_i| )) |
| \f] |
| where \f$ Z \f$ is a standard normally distributed random variable. |
| |
| |
| The condition number is computed as \f$ \kappa(H) \f$ during the iteration |
| immediately <em>preceding</em> convergence (i.e., \f$ A \f$ is computed using |
| the coefficients of the previous iteration). A large condition number (say, more |
| than 1000) indicates the presence of significant multicollinearity. |
| |
| @anchor Literature |
| @literature |
| |
| A somewhat random selection of nice write-ups, with valuable pointers into |
| further literature: |
| |
| [1] John Fox: Cox Proportional-Hazards Regression for Survival Data, |
| Appendix to An R and S-PLUS companion to Applied Regression Feb 2012, |
| http://cran.r-project.org/doc/contrib/Fox-Companion/appendix-cox-regression.pdf |
| |
| [2] Stephen J Walters: What is a Cox model? |
| http://www.medicine.ox.ac.uk/bandolier/painres/download/whatis/cox_model.pdf |
| |
| @anchor related |
| @par Related Topics |
| |
| File cox_prop_hazards.sql_in documenting the functions |
| |
| @internal |
| @sa Namespace cox_prop_hazards |
| \ref madlib::modules::stats documenting the implementation in C++ |
| @endinternal |
| |
| */ |
| |
| ---------------------------------------------------------------------- |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.coxph_train() |
| RETURNS VARCHAR AS $$ |
| BEGIN |
| RETURN MADLIB_SCHEMA.coxph_train(''); |
| END; |
| $$ LANGUAGE plpgsql VOLATILE; |
| |
| ---------------------------------------------------------------------- |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.coxph_train( |
| message VARCHAR -- usage string |
| ) |
| RETURNS VARCHAR AS $$ |
| PythonFunction(stats, cox_prop_hazards, coxph_help_message) |
| $$ LANGUAGE plpythonu VOLATILE; |
| |
| ---------------------------------------------------------------------- |
| |
| /** |
| * @brief Compute cox-regression coefficients and diagnostic statistics |
| * |
| * To include an intercept in the model, set one coordinate in the |
| * <tt>independentVariables</tt> array to 1. |
| * |
| * @param source_table Name of the source relation containing the training data |
| * @param output_table Name of the output relation to contain the coxph_train model |
| * @param independent_varname Name of the independent column |
| * @param dependent_varname Name of the dependent column measuring time of death |
| * @param right_censoring_status Name of the column that determines right censoring support |
| * @param strata Comma-separated list of column names on which to stratify the data |
| * @param optimizer_params Comma-separated list of parameters for the optimizer function |
| * |
| * @return An output table with following columns: |
| * - <tt>coef FLOAT8[]</tt> - Array of coefficients, \f$ \boldsymbol \beta \f$ |
| * - <tt>log_likelihood FLOAT8</tt> - Log-likelihood \f$l(\boldsymbol \beta)\f$ |
| * - <tt>std_err FLOAT8[]</tt> - Array of standard errors, |
| * \f$ \mathit{se}(c_1), \dots, \mathit{se}(c_k) \f$ |
| * - <tt>z_stats FLOAT8[]</tt> - Array of Wald z-statistics, \f$ \boldsymbol z \f$ |
| * - <tt>p_values FLOAT8[]</tt> - Array of Wald p-values, \f$ \boldsymbol p \f$ |
| * - <tt>condition_no FLOAT8</tt> - The condition number of matrix |
| * \f$ H \f$ during the iteration immediately <em>preceding</em> |
| * convergence (i.e., \f$ H \f$ is computed using the coefficients of the |
| * previous iteration) |
| * - <tt>num_iterations INTEGER</tt> - The number of iterations before the |
| * algorithm terminated |
| * |
| * - Get vector of coefficients \f$ \boldsymbol \beta \f$ and all diagnostic |
| * statistics:\n |
| * <pre>SELECT \ref coxph_train( |
| * '<em>source_table</em>', '<em>output_table</em>', |
| * '<em>dependent_varname</em>', '<em>independent_varname</em>' |
| * '<em>right_censoring_status</em>', '<em>strata</em>' |
| * );</pre> |
| * - Get vector of coefficients \f$ \boldsymbol \beta \f$:\n |
| * <pre>SELECT coef from '<em>output_table</em>'</pre> |
| */ |
| CREATE FUNCTION MADLIB_SCHEMA.coxph_train( |
| source_table VARCHAR -- name of input table |
| , output_table VARCHAR -- name of output table |
| , dependent_varname VARCHAR -- name of dependent variable |
| , independent_varname VARCHAR -- name of independent variable |
| , right_censoring_status VARCHAR -- censoring status |
| , strata VARCHAR -- comma separated stratifying column names |
| , optimizer_params VARCHAR -- a string with parameters separated by comma |
| ) |
| RETURNS VOID AS $$ |
| PythonFunctionBodyOnly(`stats', `cox_prop_hazards') |
| cox_prop_hazards.coxph( |
| schema_madlib, source_table, output_table, dependent_varname, |
| independent_varname, right_censoring_status, strata, |
| optimizer_params) |
| $$ LANGUAGE plpythonu; |
| |
| ---------------------------------------------------------------------- |
| |
| /** |
| * @brief Cox regresison training function |
| **/ |
| CREATE FUNCTION MADLIB_SCHEMA.coxph_train( |
| source_table VARCHAR -- name of input table |
| , output_table VARCHAR -- name of output table |
| , dependent_variable VARCHAR -- name of dependent variable |
| , independent_variable VARCHAR -- name of independent variable |
| , right_censoring_status VARCHAR -- censored status |
| , strata VARCHAR -- stratification column names |
| ) |
| RETURNS VOID AS $$ |
| SELECT MADLIB_SCHEMA.coxph_train($1, $2, $3, $4, $5, $6, NULL) |
| $$ LANGUAGE sql VOLATILE; |
| |
| ---------------------------------------------------------------------- |
| |
| /** |
| * @brief Cox regresison training function |
| **/ |
| CREATE FUNCTION MADLIB_SCHEMA.coxph_train( |
| source_table VARCHAR -- name of input table |
| , output_table VARCHAR -- name of output table |
| , dependent_variable VARCHAR -- name of dependent variable |
| , independent_variable VARCHAR -- name of independent variable |
| , right_censoring_status VARCHAR -- censored status |
| ) |
| RETURNS VOID AS $$ |
| SELECT MADLIB_SCHEMA.coxph_train($1, $2, $3, $4, $5, NULL) |
| $$ LANGUAGE sql VOLATILE; |
| |
| ---------------------------------------------------------------------- |
| |
| /** |
| * @brief Cox regresison training function |
| **/ |
| CREATE FUNCTION MADLIB_SCHEMA.coxph_train( |
| source_table VARCHAR -- name of input table |
| , output_table VARCHAR -- name of output table |
| , dependent_variable VARCHAR -- name of dependent variable |
| , independent_variable VARCHAR -- name of independent variable |
| ) |
| RETURNS VOID AS $$ |
| SELECT MADLIB_SCHEMA.coxph_train($1, $2, $3, $4, 'TRUE') |
| $$ LANGUAGE sql VOLATILE; |
| ------------------------------------------------------------------------- |
| |
| /** |
| * @brief Test the proportional hazards assumption for a Cox regression model fit (coxph_train) |
| |
| @param coxph_model_table Name of the table that contains the Cox Proportional Hazards model |
| @param output_table Name of the output table to contain the statistics to test the proportional hazards assumption |
| |
| @return An output table with following columns: |
| Columns of the matrix contain the correlation coefficient between transformed , a chi-square, and the two-sided p-value. |
| - <tt>rho FLOAT8[]</tt> - Array of correlation coefficient between survival time and the scaled Schoenfeld residuals |
| - <tt>chi_square FLOAT8[]</tt> - Chi-square test statistic for the |
| - <tt>p_value FLOAT8[]</tt> - Two-side p-value for the chi-square test |
| **/ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.cox_zph |
| ( |
| coxph_model_table VARCHAR, |
| output_table VARCHAR |
| ) |
| RETURNS VOID AS $$ |
| PythonFunctionBodyOnly(`stats', `cox_prop_hazards') |
| cox_prop_hazards.zph(schema_madlib, coxph_model_table, output_table) |
| $$ LANGUAGE plpythonu; |
| |
| ------------------------------------------------------------------------- |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.cox_zph() |
| RETURNS VARCHAR AS $$ |
| BEGIN |
| RETURN MADLIB_SCHEMA.cox_zph(''); |
| END; |
| $$ LANGUAGE plpgsql VOLATILE; |
| |
| ---------------------------------------------------------------------- |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.cox_zph( |
| message VARCHAR -- usage string |
| ) |
| RETURNS VARCHAR AS $$ |
| PythonFunction(stats, cox_prop_hazards, zph_help_message) |
| $$ LANGUAGE plpythonu VOLATILE; |
| |
| |
| -------------------------------------------------------------------------------- |
| |
| -- Internal functions and types------------------------------------------------- |
| ---------------------------------------------------------------------- |
| DROP TYPE IF EXISTS MADLIB_SCHEMA.coxph_result; |
| CREATE TYPE MADLIB_SCHEMA.coxph_result AS ( |
| coef DOUBLE PRECISION[], |
| logLikelihood DOUBLE PRECISION, |
| std_err DOUBLE PRECISION[], |
| z_stats DOUBLE PRECISION[], |
| p_values DOUBLE PRECISION[], |
| hessian DOUBLE PRECISION[], |
| num_processed INTEGER |
| ); |
| |
| ---------------------------------------------------------------------- |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.coxph_step_final( |
| state DOUBLE PRECISION[]) |
| RETURNS DOUBLE PRECISION[] AS 'MODULE_PATHNAME' |
| LANGUAGE C IMMUTABLE STRICT; |
| |
| ------------------------------------ |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.coxph_step_transition( |
| /*+ state */ DOUBLE PRECISION[], |
| /*+ x */ DOUBLE PRECISION[], |
| /*+ y */ DOUBLE PRECISION, |
| /*+ status */ BOOLEAN, |
| /*+ coef */ DOUBLE PRECISION[] |
| ) |
| RETURNS DOUBLE PRECISION[] AS 'MODULE_PATHNAME' |
| LANGUAGE C IMMUTABLE; |
| |
| ------------------------------------ |
| |
| /** |
| * @internal |
| * @brief Perform one iteration the Newton-Rhapson method. |
| */ |
| CREATE |
| m4_ifdef(`__GREENPLUM__',m4_ifdef(`__HAS_ORDERED_AGGREGATES__',`ORDERED')) |
| AGGREGATE MADLIB_SCHEMA.coxph_step( |
| /*+ x */ DOUBLE PRECISION[], |
| /*+ y */ DOUBLE PRECISION, |
| /*+ status */ BOOLEAN, |
| /*+ coef */ DOUBLE PRECISION[] |
| ) |
| ( |
| STYPE=DOUBLE PRECISION[], |
| SFUNC=MADLIB_SCHEMA.coxph_step_transition, |
| FINALFUNC=MADLIB_SCHEMA.coxph_step_final, |
| INITCOND='{0,0,0,0,0,0,0}' |
| ); |
| |
| ---------------------------------------------------------------------- |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.coxph_step_inner_final( |
| state DOUBLE PRECISION[]) |
| RETURNS DOUBLE PRECISION[] AS 'MODULE_PATHNAME' |
| LANGUAGE C IMMUTABLE STRICT; |
| |
| ---------------------------- |
| |
| CREATE |
| m4_ifdef(`__GREENPLUM__',m4_ifdef(`__HAS_ORDERED_AGGREGATES__',`ORDERED')) |
| AGGREGATE MADLIB_SCHEMA.coxph_strata_step_inner( |
| /*+ x */ DOUBLE PRECISION[], |
| /*+ y */ DOUBLE PRECISION, |
| /*+ status */ BOOLEAN, |
| /*+ coef */ DOUBLE PRECISION[] |
| ) |
| ( |
| STYPE=DOUBLE PRECISION[], |
| SFUNC=MADLIB_SCHEMA.coxph_step_transition, |
| FinalFunc = MADLIB_SCHEMA.coxph_step_inner_final, |
| INITCOND='{0,0,0,0,0,0,0}' |
| ); |
| |
| ---------------------------------------------------------------------- |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.coxph_step_outer_transition( |
| /*+ state1 */ DOUBLE PRECISION[], |
| /*+ state2 */ DOUBLE PRECISION[]) |
| RETURNS DOUBLE PRECISION[] AS 'MODULE_PATHNAME' |
| LANGUAGE c IMMUTABLE; |
| |
| ------------------------------------ |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.coxph_step_strata_final( |
| state DOUBLE PRECISION[]) |
| RETURNS DOUBLE PRECISION[] AS 'MODULE_PATHNAME' |
| LANGUAGE C IMMUTABLE STRICT; |
| |
| ------------------------------------ |
| |
| CREATE AGGREGATE MADLIB_SCHEMA.coxph_strata_step_outer( |
| /*+ state */ DOUBLE PRECISION[] |
| ) |
| ( |
| STYPE=DOUBLE PRECISION[], |
| SFUNC=MADLIB_SCHEMA.coxph_step_outer_transition, |
| FINALFUNC=MADLIB_SCHEMA.coxph_step_strata_final |
| m4_ifdef( |
| `GREENPLUM', |
| `, prefunc=MADLIB_SCHEMA.coxph_step_outer_transition' |
| ) |
| ); |
| |
| ---------------------------------------------------------------------- |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.internal_coxph_step_distance( |
| /*+ state1 */ DOUBLE PRECISION[], |
| /*+ state2 */ DOUBLE PRECISION[]) |
| RETURNS DOUBLE PRECISION AS 'MODULE_PATHNAME' |
| LANGUAGE c IMMUTABLE STRICT; |
| |
| ---------------------------------------------------------------------- |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.internal_coxph_result( |
| /*+ state */ DOUBLE PRECISION[]) |
| RETURNS MADLIB_SCHEMA.coxph_result AS 'MODULE_PATHNAME' |
| LANGUAGE c IMMUTABLE STRICT; |
| ---------------------------------------------------------------------- |
| |
| -- Schoenfeld residuals -------------------------------------------------------- |
| -- Transition function to be called for the Schoenfeld residuals |
| -- as part of an aggregate-defined window function |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__zph_transition( |
| /*+ state */ DOUBLE PRECISION[], |
| /*+ x */ DOUBLE PRECISION[], |
| /*+ coef */ DOUBLE PRECISION[]) |
| RETURNS DOUBLE PRECISION[] AS |
| 'MODULE_PATHNAME', 'zph_transition' |
| LANGUAGE c IMMUTABLE; |
| |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__zph_merge( |
| /*+ left_state */ DOUBLE PRECISION[], |
| /*+ right_state */ DOUBLE PRECISION[]) |
| RETURNS DOUBLE PRECISION[] AS |
| 'MODULE_PATHNAME', 'zph_merge' |
| LANGUAGE c IMMUTABLE; |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__zph_final( |
| /*+ left_state */ DOUBLE PRECISION[]) |
| RETURNS DOUBLE PRECISION[] AS |
| 'MODULE_PATHNAME', 'zph_final' |
| LANGUAGE c IMMUTABLE; |
| |
| /** |
| * @internal |
| * @brief Aggregate for the Schoenfeld residuals |
| * (should be used as part of an aggregate-defined window function) |
| */ |
| CREATE AGGREGATE MADLIB_SCHEMA.zph_agg( |
| /*+ x */ DOUBLE PRECISION[], |
| /*+ coef */ DOUBLE PRECISION[]) |
| ( |
| STYPE=DOUBLE PRECISION[], |
| SFUNC=MADLIB_SCHEMA.__zph_transition, |
| FINALFUNC=MADLIB_SCHEMA.__zph_final |
| m4_ifdef(`__GREENPLUM__', `, PREFUNC = MADLIB_SCHEMA.__zph_merge') |
| ); |
| |
| ------------------------------------------------------------------------- |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__coxph_scale_resid( |
| m INTEGER, |
| hessian DOUBLE PRECISION[], |
| resid DOUBLE PRECISION[]) |
| RETURNS DOUBLE PRECISION[] |
| AS 'MODULE_PATHNAME', 'coxph_scale_resid' |
| LANGUAGE C IMMUTABLE STRICT; |
| |
| ------------------------------------------------------------------------- |
| |
| -- TODO: Move these out of cox prop hazards module |
| -- Aggregates for array correlation--------------------------------------------- |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__array_elem_corr_transition( |
| /*+ state */ DOUBLE PRECISION[], |
| /*+ x */ DOUBLE PRECISION[], |
| /*+ y */ DOUBLE PRECISION) |
| RETURNS DOUBLE PRECISION[] AS |
| 'MODULE_PATHNAME', 'array_elem_corr_transition' |
| LANGUAGE c IMMUTABLE; |
| |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__array_elem_corr_merge( |
| /*+ left_state */ DOUBLE PRECISION[], |
| /*+ right_state */ DOUBLE PRECISION[]) |
| RETURNS DOUBLE PRECISION[] AS |
| 'MODULE_PATHNAME', 'array_elem_corr_merge' |
| LANGUAGE c IMMUTABLE; |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__array_elem_corr_final( |
| /*+ state */ DOUBLE PRECISION[]) |
| RETURNS DOUBLE PRECISION[] AS |
| 'MODULE_PATHNAME', 'array_elem_corr_final' |
| LANGUAGE c IMMUTABLE; |
| |
| /** |
| * @internal |
| * @brief Correlation function to compute element-wise correlation between an |
| * array of variables with another scalar variable. |
| */ |
| CREATE AGGREGATE MADLIB_SCHEMA.array_elem_corr_agg( |
| /*+ array_input */ DOUBLE PRECISION[], |
| /*+ common_elem */ DOUBLE PRECISION) |
| ( |
| STYPE=DOUBLE PRECISION[], |
| SFUNC=MADLIB_SCHEMA.__array_elem_corr_transition, |
| FINALFUNC=MADLIB_SCHEMA.__array_elem_corr_final, |
| INITCOND='{0,0,0,0,0,0,0}' |
| m4_ifdef(`__GREENPLUM__', `, PREFUNC = MADLIB_SCHEMA.__array_elem_corr_merge') |
| ); |
| |
| -------------------------------------------------------------------------------- |
| -- Aggregates for computing various statistics for coxph ----------------------- |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__coxph_resid_stat_transition( |
| /*+ state */ DOUBLE PRECISION[], |
| /*+ w */ DOUBLE PRECISION, |
| /*+ residual */ DOUBLE PRECISION[], |
| /*+ hessian */ DOUBLE PRECISION[], |
| /*+ m */ INTEGER) |
| RETURNS DOUBLE PRECISION[] |
| AS 'MODULE_PATHNAME', 'coxph_resid_stat_transition' |
| LANGUAGE c IMMUTABLE; |
| |
| ------------------------------------ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__coxph_resid_stat_merge( |
| /*+ state1 */ DOUBLE PRECISION[], |
| /*+ state2 */ DOUBLE PRECISION[]) |
| RETURNS DOUBLE PRECISION[] |
| AS 'MODULE_PATHNAME', 'coxph_resid_stat_merge' |
| LANGUAGE c IMMUTABLE; |
| |
| ------------------------------------ |
| DROP TYPE IF EXISTS MADLIB_SCHEMA.__cox_resid_stat_result; |
| CREATE TYPE MADLIB_SCHEMA.__cox_resid_stat_result AS |
| ( |
| chi_square_stat DOUBLE PRECISION[], |
| p_value DOUBLE PRECISION[] |
| ); |
| ------------------------------------ |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__coxph_resid_stat_final( |
| state DOUBLE PRECISION[]) |
| RETURNS MADLIB_SCHEMA.__cox_resid_stat_result |
| AS 'MODULE_PATHNAME', 'coxph_resid_stat_final' |
| LANGUAGE C IMMUTABLE STRICT; |
| |
| ------------------------------------ |
| |
| CREATE AGGREGATE MADLIB_SCHEMA.__coxph_resid_stat_agg( |
| /*+ w */ DOUBLE PRECISION, |
| /*+ residual */ DOUBLE PRECISION[], |
| /*+ hessian */ DOUBLE PRECISION[], |
| /*+ m */ INTEGER |
| ) |
| ( |
| STYPE=DOUBLE PRECISION[], |
| SFUNC=MADLIB_SCHEMA.__coxph_resid_stat_transition, |
| FINALFUNC=MADLIB_SCHEMA.__coxph_resid_stat_final |
| m4_ifdef( |
| `GREENPLUM', |
| `, prefunc=MADLIB_SCHEMA.__coxph_resid_stat_merge' |
| ) |
| ); |
| |
| ---------------------------------------------------------------------- |
| -- Deprecated functions ----------------------------------------------------------- |
| |
| /** |
| * @brief Cox regresison training function |
| **/ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.cox_prop_hazards( |
| source_table VARCHAR -- name of input table |
| , out_table VARCHAR -- name of output table |
| , dependent_varname VARCHAR -- name of dependent variable |
| , independent_varname VARCHAR -- name of independent variable |
| , status VARCHAR -- censoring status |
| ) |
| RETURNS VOID AS $$ |
| DECLARE |
| temp_string VARCHAR; |
| BEGIN |
| SELECT MADLIB_SCHEMA.__unique_string() into temp_string; |
| PERFORM MADLIB_SCHEMA.coxph_train($1, temp_string, $3, $4, $5); |
| EXECUTE ' |
| CREATE TABLE ' || out_table || |
| ' AS SELECT coef, std_err, z_stats, p_values FROM ' || |
| temp_string; |
| EXECUTE 'DROP TABLE ' || temp_string; |
| EXECUTE 'DROP TABLE ' || temp_string || '_summary'; |
| |
| END |
| $$ LANGUAGE plpgsql VOLATILE; |
| |
| /** |
| * @brief Cox regresison training function |
| **/ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.cox_prop_hazards( |
| source_table VARCHAR -- name of input table |
| , out_table VARCHAR -- name of output table |
| , dependent_variable VARCHAR -- name of dependent variable |
| , independent_variable VARCHAR -- name of independent variable |
| ) |
| RETURNS VOID AS $$ |
| BEGIN |
| EXECUTE 'SELECT MADLIB_SCHEMA.cox_prop_hazards(' || |
| ' ''' || source_table || ''' ' || |
| ' ,''' || out_table || ''' ' || |
| ' ,''' || dependent_variable || ''' ' || |
| ' ,''' || independent_variable || ''' ' || |
| ' , ''TRUE'')'; |
| END; |
| $$ LANGUAGE plpgsql VOLATILE; |
| |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.cox_prop_hazards() |
| RETURNS VARCHAR AS $$ |
| BEGIN |
| RETURN MADLIB_SCHEMA.cox_prop_hazards(''); |
| END; |
| $$ LANGUAGE plpgsql VOLATILE; |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.cox_prop_hazards( |
| usage_string VARCHAR -- usage string |
| ) |
| RETURNS VARCHAR AS $$ |
| DECLARE |
| insert_string VARCHAR; |
| BEGIN |
| IF (usage_string = '') THEN |
| insert_string := '' || |
| E'Summary \n' || |
| E'-----------------------------------------------------------------------------------------\n' || |
| E' Functionality: Cox proprtional hazards regression (Breslow method)\n' || |
| E' SELECT {schema_madlib}.cox_prop_hazards(''source_table'' \n' || |
| E' ,''output_table'' \n' || |
| E' ,''dependent_variable'' \n' || |
| E' ,''independent_variable'' \n' || |
| E' ,''right_censoring_status'' \n' || |
| E' );\n' || |
| E'For more details on function usage: \n' || |
| E'SELECT {schema_madlib}.cox_prop_hazards(''usage'') \n' || |
| E''; |
| ELSIF (usage_string = 'usage' OR usage_string = 'help' OR usage_string = '?') THEN |
| insert_string := '' || |
| E'Usage\n' || |
| E'-----------------------------------------------------------------------------------------\n' || |
| E' SELECT {schema_madlib}.cox_prop_hazards( \n' || |
| E' ''source_table'', -- Name of data table \n' || |
| E' ''output_table'', -- Name of result table (overwrites if exists) \n' || |
| E' ''dependent_variable'', -- Name of column for dependent variables\n' || |
| E' ''independent_variable'', -- Name of column for independent variables\n' || |
| E' (can be any SQL expression Eg: ''*'') \n' || |
| E' [''right_censoring_status'', -- Name of the column containing censoring status \n' || |
| E' 0/false : If the observation is censored\n' || |
| E' 1/true : otherwise\n' || |
| E' Default is 1/true for all observations\n' || |
| E' Can also be an SQL expression: ''dependent_variable < 10'') \n' || |
| E' );\n' || |
| E'\n' || |
| E'Output:\n' || |
| E'-----------------------------------------------------------------------------------------\n' || |
| E' The output table (''output_table'' above) has the following columns\n' || |
| E' ''coef'' DOUBLE PRECISION[], -- Coefficients of regression \n' || |
| E' ''std_err'' DOUBLE PRECISION[], -- Standard errors\n' || |
| E' ''z_stats'' DOUBLE PRECISION[], -- z-stats of the standard errors\n' || |
| E' ''p_values'' DOUBLE PRECISION[], -- p-values of the standard errors\n' || |
| E'\n' || |
| E''; |
| ELSE |
| insert_string := 'No such option. Run SELECT {schema_madlib}.cox_prop_hazards()'; |
| END IF; |
| RETURN insert_string; |
| END; |
| $$ LANGUAGE plpgsql VOLATILE; |
| |
| ------------------------------------------------------------------------- |
| |
| DROP TYPE IF EXISTS MADLIB_SCHEMA.cox_prop_hazards_result; |
| CREATE TYPE MADLIB_SCHEMA.cox_prop_hazards_result AS ( |
| coef DOUBLE PRECISION[], |
| logLikelihood DOUBLE PRECISION, |
| std_err DOUBLE PRECISION[], |
| z_stats DOUBLE PRECISION[], |
| p_values DOUBLE PRECISION[], |
| num_iterations INTEGER |
| ); |
| |
| |
| CREATE FUNCTION MADLIB_SCHEMA.cox_prop_hazards_regr( |
| "source" VARCHAR, |
| "indepColumn" VARCHAR, |
| "depColumn" VARCHAR, |
| "status" VARCHAR, |
| "maxNumIterations" INTEGER /*+ DEFAULT 20 */, |
| "optimizer" VARCHAR /*+ DEFAULT 'newton' */, |
| "precision" DOUBLE PRECISION /*+ DEFAULT 0.0001 */) |
| RETURNS MADLIB_SCHEMA.cox_prop_hazards_result AS $$ |
| PythonFunctionBodyOnly(`stats', `cox_prop_hazards') |
| optimizer_params = "max_iter=" + str(maxNumIterations) + \ |
| ", optimizer=" + str(optimizer) + \ |
| ", tolerance=" + str(precision) |
| temp_string = plpy.execute("SELECT {schema_madlib}.__unique_string() AS t". |
| format(schema_madlib=schema_madlib))[0]["t"] |
| |
| plpy.execute(""" |
| SELECT {schema_madlib}.coxph_train( |
| '{source}','{temp}', '{dep}', '{indep}', |
| '{status}', NULL, '{optimizer}') |
| """.format(schema_madlib=schema_madlib, |
| source=source, |
| temp=temp_string, |
| dep=depColumn, |
| indep=indepColumn, |
| status=status, |
| optimizer=optimizer_params)) |
| return plpy.execute("SELECT coef, loglikelihood, std_err, z_stats," |
| "p_values, num_iterations FROM {temp}". |
| format(temp=temp_string))[0] |
| $$ LANGUAGE plpythonu VOLATILE; |
| |
| |
| -- Note: The function cox_prop_hazards_regr has been deprecated but maintained |
| CREATE FUNCTION MADLIB_SCHEMA.cox_prop_hazards_regr( |
| "source" VARCHAR, |
| "indepColumn" VARCHAR, |
| "depColumn" VARCHAR, |
| "status" VARCHAR) |
| RETURNS MADLIB_SCHEMA.cox_prop_hazards_result AS |
| $$SELECT MADLIB_SCHEMA.cox_prop_hazards_regr($1, $2, $3, $4, 20, 'newton', 0.0001);$$ |
| LANGUAGE sql VOLATILE; |
| |
| -- Note: The function cox_prop_hazards_regr has been deprecated but maintained |
| CREATE FUNCTION MADLIB_SCHEMA.cox_prop_hazards_regr( |
| "source" VARCHAR, |
| "indepColumn" VARCHAR, |
| "depColumn" VARCHAR, |
| "status" VARCHAR, |
| "maxNumIterations" INTEGER) |
| RETURNS MADLIB_SCHEMA.cox_prop_hazards_result AS |
| $$SELECT MADLIB_SCHEMA.cox_prop_hazards_regr($1, $2, $3, $4, $5, 'newton', 0.0001);$$ |
| LANGUAGE sql VOLATILE; |
| |
| -- Note: The function cox_prop_hazards_regr has been deprecated but maintained |
| CREATE FUNCTION MADLIB_SCHEMA.cox_prop_hazards_regr( |
| "source" VARCHAR, |
| "indepColumn" VARCHAR, |
| "depColumn" VARCHAR, |
| "status" VARCHAR, |
| "maxNumIterations" INTEGER, |
| "optimizer" VARCHAR) |
| RETURNS MADLIB_SCHEMA.cox_prop_hazards_result AS |
| $$SELECT MADLIB_SCHEMA.cox_prop_hazards_regr($1, $2, $3, $4, $5, $6, 0.0001);$$ |
| LANGUAGE sql VOLATILE; |
| |