| # coding=utf-8 |
| |
| """ |
| @file logistic.py_in |
| |
| @brief Logistic Regression: Driver functions |
| |
| @namespace logistic |
| |
| Logistic Regression: Driver functions |
| """ |
| |
| import plpy |
| |
def __runIterativeAlg(stateType, initialState, source, updateExpr,
        terminateExpr, cyclesPerIteration, maxNumIterations):
    """
    Driver for an iterative algorithm

    A general driver function for most iterative algorithms: The state between
    iterations is kept in a variable of type <tt>stateType</tt>, which is
    initialized with <tt><em>initialState</em></tt>. During each cycle, the
    SQL statement <tt>updateSQL</tt> is executed in the database. Afterwards,
    the SQL query <tt>terminateSQL</tt> decides whether the algorithm
    terminates.

    Every state is stored as one row of the temporary table
    <tt>_madlib_iterative_alg</tt> (columns <tt>_madlib_iteration</tt> and
    <tt>_madlib_state</tt>). The table is dropped and re-created at the start
    of each call and is deliberately left in place on return.

    @note <tt>stateType</tt>, <tt>initialState</tt>, <tt>source</tt> and both
        expressions are interpolated verbatim into SQL text, so callers must
        only pass trusted values.

    @param stateType SQL type of the state between iterations
    @param initialState The initial value of the SQL state variable
    @param source The source relation
    @param updateExpr SQL expression that returns the new state of type
        <tt>stateType</tt>. The expression may use the replacement fields
        <tt>"{state}"</tt>, <tt>"{iteration}"</tt>, and
        <tt>"{sourceAlias}"</tt>. Source alias is an alias for the source
        relation <tt><em>source</em></tt>.
    @param terminateExpr SQL expression that returns whether the algorithm should
        terminate. The expression may use the replacement fields
        <tt>"{oldState}"</tt>, <tt>"{newState}"</tt>, and
        <tt>"{iteration}"</tt>. It must return a BOOLEAN value.
        <tt>"{oldState}"</tt> refers to the state stored
        <tt>cyclesPerIteration</tt> cycles before <tt>"{newState}"</tt>.
    @param cyclesPerIteration Number of aggregate function calls per iteration.
    @param maxNumIterations Maximum number of iterations. Algorithm will then
        terminate even when <tt>terminateExpr</tt> does not evaluate to
        <tt>true</tt>. At least one cycle is always executed.

    @return Number of the last cycle executed, i.e., the value of
        <tt>_madlib_iteration</tt> of the final state
    """

    # Replacement fields offered to the SQL templates and to the
    # caller-supplied expressions. "iteration" is refreshed on every cycle so
    # that "{iteration}" inside updateExpr/terminateExpr resolves to the
    # current cycle instead of raising a KeyError.
    fields = {
        'state': "(st._madlib_state)",
        'sourceAlias': "src",
        'oldState': "(older._madlib_state)",
        'newState': "(newer._madlib_state)",
        'stateType': stateType,
        'initialState': initialState,
        'source': source,
        'cyclesPerIteration': cyclesPerIteration,
        'maxNumIterations': maxNumIterations,
        'iteration': 0,
    }

    updateSQL = """
        INSERT INTO _madlib_iterative_alg
        SELECT
            {iteration},
            {updateExpr}
        FROM
            _madlib_iterative_alg AS st,
            {source} AS {sourceAlias}
        WHERE
            st._madlib_iteration = {iteration} - 1
        """
    terminateSQL = """
        SELECT
            {terminateExpr} AS should_terminate
        FROM
        (
            SELECT _madlib_state
            FROM _madlib_iterative_alg
            WHERE _madlib_iteration = {iteration} - {cyclesPerIteration}
        ) AS older,
        (
            SELECT _madlib_state
            FROM _madlib_iterative_alg
            WHERE _madlib_iteration = {iteration}
        ) AS newer
        """

    # Remember the current message level so that it can be restored once the
    # state table has been (re-)created under the stricter "error" level.
    oldMsgLevel = plpy.execute(
        "SELECT setting FROM pg_settings WHERE name='client_min_messages'"
        )[0]['setting']
    plpy.execute("""
        SET client_min_messages = error;
        DROP TABLE IF EXISTS _madlib_iterative_alg;
        CREATE TEMPORARY TABLE _madlib_iterative_alg (
            _madlib_iteration INTEGER PRIMARY KEY,
            _madlib_state {stateType}
        );
        SET client_min_messages = {oldMsgLevel};
        """.format(oldMsgLevel=oldMsgLevel, **fields))

    # Row 0 holds the initial state
    plpy.execute("""
        INSERT INTO _madlib_iterative_alg VALUES ({iteration}, {initialState})
        """.format(**fields))

    iteration = 0
    lastCycle = cyclesPerIteration * maxNumIterations
    while True:
        iteration = iteration + 1
        fields['iteration'] = iteration
        plpy.execute(updateSQL.format(
            updateExpr=updateExpr.format(**fields), **fields))

        # The cap on the number of iterations applies unconditionally. (It
        # used to be checked only after the first full iteration, which let
        # maxNumIterations <= 1 run one cycle too many.)
        if iteration >= lastCycle:
            break

        # The termination query is only issued once a full iteration lies
        # between the initial state (row 0) and the older state it reads.
        if iteration > cyclesPerIteration:
            shouldTerminate = plpy.execute(terminateSQL.format(
                terminateExpr=terminateExpr.format(**fields), **fields)
                )[0]['should_terminate']
            if shouldTerminate:
                break

    # Note: We do not drop the temporary table
    return iteration
| |
| |
def __cg_logregr(**kwargs):
    """
    Conjugate-gradient variant of logistic regression

    Builds the SQL expressions for the conjugate-gradient method from the
    keyword arguments and hands them to __runIterativeAlg(). The keyword
    arguments are those of compute_logregr(); <tt>optimizer</tt> is not read
    here.

    @return The value returned by __runIterativeAlg()
    """

    relation = kwargs['source']

    # Only the single-brace fields are filled in from kwargs at this point.
    # The doubled braces come out as "{state}", "{newState}", and "{oldState}"
    # and are substituted later, inside __runIterativeAlg.
    stepExpr = """
        {MADlibSchema}.logregr_cg_step(
            {depColumn},
            {indepColumn},
            {{state}}
        )
        """.format(**kwargs)

    # A precision of exactly 0 disables the convergence test altogether
    if kwargs['precision'] != 0.:
        stopExpr = """
            {MADlibSchema}.internal_logregr_cg_step_distance({{newState}}, {{oldState}}) < {precision}
            """.format(**kwargs)
    else:
        stopExpr = "FALSE"

    iterationCap = kwargs['numIterations']
    return __runIterativeAlg(
        stateType="FLOAT8[]",
        initialState="NULL",
        source=relation,
        updateExpr=stepExpr,
        terminateExpr=stopExpr,
        cyclesPerIteration=1,
        maxNumIterations=iterationCap)
| |
| |
def __irls__logregr(**kwargs):
    """
    Iteratively-reweighted-least-squares variant of logistic regression

    Builds the SQL expressions for the iteratively-reweighted-least-squares
    method from the keyword arguments and hands them to __runIterativeAlg().
    The keyword arguments are those of compute_logregr(); <tt>optimizer</tt>
    is not read here.

    @return The value returned by __runIterativeAlg()
    """

    relation = kwargs['source']

    # "{{state}}" is emitted as "{state}" by this format() call and is only
    # substituted later, inside __runIterativeAlg.
    stepExpr = """
        {MADlibSchema}.logregr_irls_step(
            {depColumn},
            {indepColumn},
            {{state}}
        )
        """.format(**kwargs)

    # A precision of exactly 0 disables the convergence test altogether
    if kwargs['precision'] != 0.:
        stopExpr = """
            {MADlibSchema}.internal_logregr_irls_step_distance({{newState}}, {{oldState}}) < {precision}
            """.format(**kwargs)
    else:
        stopExpr = "FALSE"

    iterationCap = kwargs['numIterations']
    return __runIterativeAlg(
        stateType="FLOAT8[]",
        initialState="NULL",
        source=relation,
        updateExpr=stepExpr,
        terminateExpr=stopExpr,
        cyclesPerIteration=1,
        maxNumIterations=iterationCap)
| |
| |
def compute_logregr(**kwargs):
    """
    Compute logistic regression coefficients

    This method serves as an interface to different optimization algorithms.
    By default, iteratively reweighted least squares is used, but for data with
    a lot of columns the conjugate-gradient method might perform better.

    @param MADlibSchema Name of the schema that contains the MADlib step and
        distance functions
    @param source Name of relation containing the training data
    @param depColumn Name of dependent column in training data (of type BOOLEAN)
    @param indepColumn Name of independent column in training data (of type
        DOUBLE PRECISION[])

    Optionally also provide the following:
    @param optimizer Name of the optimizer. 'newton' or 'irls': Iteratively
        reweighted least squares, 'cg': conjugate gradient (default = 'irls')
    @param numIterations Maximum number of iterations (default = 20)
    @param precision Terminate if two consecutive iterations have a difference
        in the log-likelihood of less than <tt>precision</tt>. In other
        words, we terminate if the objective function value has converged.
        If this parameter is 0.0, then the algorithm will not check for
        convergence and only terminate after <tt>numIterations</tt>
        iterations. (default = 0.0001)

    @return The value returned by the selected optimizer driver, i.e., the
        number of the last cycle executed by __runIterativeAlg(). The
        computed states themselves are not returned; they stay in the
        temporary table <tt>_madlib_iterative_alg</tt>.
    """
    # Fill in defaults only for arguments the caller did not supply
    kwargs.setdefault('optimizer', 'irls')
    kwargs.setdefault('numIterations', 20)
    kwargs.setdefault('precision', 0.0001)

    optimizer = kwargs['optimizer']
    if optimizer == 'cg':
        return __cg_logregr(**kwargs)
    if optimizer in ('irls', 'newton'):
        return __irls__logregr(**kwargs)

    plpy.error("Unknown optimizer requested. Must be 'newton'/'irls' or 'cg'")

    # NOTE(review): plpy.error is documented to raise, so this line should be
    # unreachable; it is kept as a safeguard.
    return None