# coding=utf-8
"""
@file logistic.py_in
@brief Logistic Regression: Driver functions
@namespace logistic
Logistic Regression: Driver functions
"""
import plpy


def __runIterativeAlg(stateType, initialState, source, updateExpr,
terminateExpr, cyclesPerIteration, maxNumIterations):
"""
Driver for an iterative algorithm

A general driver function for most iterative algorithms: The state between
iterations is kept in a variable of type <tt>stateType</tt>, which is
initialized with <tt><em>initialState</em></tt>. During each iteration, the
update statement built from <tt>updateExpr</tt> is executed in the database.
Afterwards, a query built from <tt>terminateExpr</tt> decides whether the
algorithm terminates. (An illustrative example of this contract follows the
function body.)
@param stateType SQL type of the state between iterations
@param initialState The initial value of the SQL state variable
@param source The source relation
@param updateExpr SQL expression that returns the new state of type
<tt>stateType</tt>. The expression may use the replacement fields
<tt>"{state}"</tt>, <tt>"{iteration}"</tt>, and
<tt>"{sourceAlias}"</tt>. Source alias is an alias for the source
relation <tt><em>source</em></tt>.
@param terminateExpr SQL expression that returns whether the algorithm should
terminate. The expression may use the replacement fields
<tt>"{oldState}"</tt>, <tt>"{newState}"</tt>, and
<tt>"{iteration}"</tt>. It must return a BOOLEAN value.
@param cyclesPerIteration Number of aggregate function calls per iteration.
@param maxNumIterations Maximum number of iterations. The algorithm terminates
after this many iterations even if <tt>terminateExpr</tt> never evaluates
to \c true
"""
state = "(st._madlib_state)"
sourceAlias = "src"
oldState = "(older._madlib_state)"
newState = "(newer._madlib_state)"
updateExpr = updateExpr.format(**locals())
terminateExpr = terminateExpr.format(**locals())
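# SQL templates. The remaining placeholders ({iteration}, {updateExpr},
# {terminateExpr}, {source}, {cyclesPerIteration}) are filled in with
# format(**locals()) each time a statement is issued in the loop below.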
updateSQL = """
INSERT INTO _madlib_iterative_alg
SELECT
{iteration},
{updateExpr}
FROM
_madlib_iterative_alg AS st,
{source} AS src
WHERE
st._madlib_iteration = {iteration} - 1
"""
terminateSQL = """
SELECT
{terminateExpr} AS should_terminate
FROM
(
SELECT _madlib_state
FROM _madlib_iterative_alg
WHERE _madlib_iteration = {iteration} - {cyclesPerIteration}
) AS older,
(
SELECT _madlib_state
FROM _madlib_iterative_alg
WHERE _madlib_iteration = {iteration}
) AS newer
"""
oldMsgLevel = plpy.execute(
"SELECT setting FROM pg_settings WHERE name='client_min_messages'"
)[0]['setting']
plpy.execute("""
SET client_min_messages = error;
DROP TABLE IF EXISTS _madlib_iterative_alg;
CREATE TEMPORARY TABLE _madlib_iterative_alg (
_madlib_iteration INTEGER PRIMARY KEY,
_madlib_state {stateType}
);
SET client_min_messages = {oldMsgLevel};
""".format(**locals()))
iteration = 0
plpy.execute("""
INSERT INTO _madlib_iterative_alg VALUES ({iteration}, {initialState})
""".format(**locals()))
while True:
iteration = iteration + 1
plpy.execute(updateSQL.format(**locals()))
if iteration > cyclesPerIteration and (
iteration >= cyclesPerIteration * maxNumIterations or
plpy.execute(terminateSQL.format(**locals()))[0]['should_terminate']):
break
# Note: We do not drop the temporary table; the caller can read the final
# state from _madlib_iterative_alg using the returned iteration number.
return iteration
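

# A minimal sketch of the calling contract of __runIterativeAlg(), assuming a
# hypothetical aggregate my_schema.my_step() over columns y and x (these names
# are purely illustrative; see __cg_logregr() and __irls__logregr() below for
# the real expressions). A caller would pass, e.g.,
#
#   updateExpr    = "my_schema.my_step(y, x, {state})"
#   terminateExpr = "my_schema.my_step_distance({newState}, {oldState}) < 0.0001"
#
# which format(**locals()) inside the driver rewrites to
#
#   my_schema.my_step(y, x, (st._madlib_state))
#   my_schema.my_step_distance((newer._madlib_state), (older._madlib_state)) < 0.0001
#
# before embedding them in updateSQL and terminateSQL.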
def __cg_logregr(**kwargs):
"""
Logistic regression algorithm with the conjugate-gradient method

The parameters are the same as for compute_logregr(), except that
<tt>optimizer</tt> should not be set. This function sets up the SQL
expressions needed for the conjugate-gradient method and then calls
__runIterativeAlg().
"""
stateType = "FLOAT8[]"
initialState = "NULL"
source = kwargs['source']
# "{state}", "{sourceAlias}", "{oldState}", and "{newState}" will not be
# substituted here but will be passed on to __runIterativeAlg and
# substituted there
updateExpr = """
{MADlibSchema}.logregr_cg_step(
{depColumn},
{indepColumn},
{{state}}
)
""".format(**kwargs)
if kwargs['precision'] == 0.:
terminateExpr = "FALSE"
else:
terminateExpr = """
{MADlibSchema}.internal_logregr_cg_step_distance({{newState}}, {{oldState}}) < {precision}
""".format(**kwargs)
cyclesPerIteration = 1
maxNumIterations = kwargs['numIterations']
return __runIterativeAlg(stateType, initialState, source, updateExpr,
terminateExpr, cyclesPerIteration, maxNumIterations)


def __irls__logregr(**kwargs):
"""
Logistic regression algorithm with the iteratively-reweighted-least-squares method

The parameters are the same as for compute_logregr(), except that
<tt>optimizer</tt> should not be set. This function sets up the SQL
expressions needed for the iteratively-reweighted-least-squares method and
then calls __runIterativeAlg().
"""
stateType = "FLOAT8[]"
initialState = "NULL"
source = kwargs['source']
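# As in __cg_logregr(), the doubly-braced fields ({{state}}, {{newState}},
# {{oldState}}) survive this format() call and are substituted later by
# __runIterativeAlg().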
updateExpr = """
{MADlibSchema}.logregr_irls_step(
{depColumn},
{indepColumn},
{{state}}
)
""".format(**kwargs)
if kwargs['precision'] == 0.:
terminateExpr = "FALSE"
else:
terminateExpr = """
{MADlibSchema}.internal_logregr_irls_step_distance({{newState}}, {{oldState}}) < {precision}
""".format(**kwargs)
cyclesPerIteration = 1
maxNumIterations = kwargs['numIterations']
return __runIterativeAlg(stateType, initialState, source, updateExpr,
terminateExpr, cyclesPerIteration, maxNumIterations)


def compute_logregr(**kwargs):
"""
Compute logistic regression coefficients

This method serves as an interface to different optimization algorithms.
By default, iteratively reweighted least squares is used, but for data with
many independent variables the conjugate-gradient method may perform better.
@param source Name of relation containing the training data
@param depColumn Name of dependent column in training data (of type BOOLEAN)
@param indepColumn Name of independent column in training data (of type
DOUBLE PRECISION[])
Optionally also provide the following:
@param optimizer Name of the optimizer: 'newton' or 'irls' for iteratively
reweighted least squares, 'cg' for conjugate gradient (default = 'irls')
@param numIterations Maximum number of iterations (default = 20)
@param precision Terminate if two consecutive iterations have a difference
in the log-likelihood of less than <tt>precision</tt>. In other
words, we terminate if the objective function value has converged.
If this parameter is 0.0, then the algorithm will not check for
convergence and only terminate after <tt>numIterations</tt>
iterations.
@return number of update steps that were executed; the final state is left in
the temporary table <tt>_madlib_iterative_alg</tt>
"""
if 'optimizer' not in kwargs:
kwargs.update(optimizer='irls')
if 'numIterations' not in kwargs:
kwargs.update(numIterations=20)
if 'precision' not in kwargs:
kwargs.update(precision=0.0001)
if kwargs['optimizer'] == 'cg':
return __cg_logregr(**kwargs)
elif kwargs['optimizer'] in ['irls', 'newton']:
return __irls__logregr(**kwargs)
else:
plpy.error("Unknown optimizer requested. Must be 'newton'/'irls' or 'cg'")
return None
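

# A minimal usage sketch, assuming compute_logregr() is invoked from a
# PL/Python wrapper; the schema, table, and column names below are
# hypothetical. MADlibSchema must also be supplied because the step
# expressions are schema-qualified:
#
#   compute_logregr(MADlibSchema='madlib',
#                   source='patients',
#                   depColumn='second_attack',
#                   indepColumn='ARRAY[1, treatment, trait_anxiety]',
#                   optimizer='cg')
#
# The call returns the number of update steps executed; a wrapper would then
# read the final state back from the temporary table _madlib_iterative_alg
# using that iteration number.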